diff options
Diffstat (limited to 'llvm/test/CodeGen')
38 files changed, 7377 insertions, 5179 deletions
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll index cbacd17..0e15b94 100644 --- a/llvm/test/CodeGen/AArch64/andcompare.ll +++ b/llvm/test/CodeGen/AArch64/andcompare.ll @@ -1,23 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i32 @and_eq_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, eq -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, eq +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -27,21 +27,21 @@ entry: } define i32 @and_eq_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, eq -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, eq +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -51,21 +51,21 @@ entry: } define i32 @and_eq_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, eq -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, eq +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -75,21 +75,21 @@ entry: } define i32 @and_eq_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, eq -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, eq +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -99,21 +99,21 @@ entry: } define i32 @and_eq_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, eq -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, eq +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -123,21 +123,21 @@ entry: } define i32 @and_eq_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, eq -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, eq +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -147,21 +147,21 @@ entry: } define i32 @and_eq_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, eq -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, eq +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -171,21 +171,21 @@ entry: } define i32 @and_eq_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, eq -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, eq +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -195,21 +195,21 @@ entry: } define i32 @and_eq_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, eq -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, eq +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -219,21 +219,21 @@ entry: } define i32 @and_eq_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_eq_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, eq -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_eq_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, eq +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_eq_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -243,21 +243,21 @@ entry: } define i32 @and_ne_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ne -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ne +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -267,21 +267,21 @@ entry: } define i32 @and_ne_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ne -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ne +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -291,21 +291,21 @@ entry: } define i32 @and_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ne -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ne +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -315,21 +315,21 @@ entry: } define i32 @and_ne_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ne -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ne +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -339,21 +339,21 @@ entry: } define i32 @and_ne_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ne -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ne +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -363,21 +363,21 @@ entry: } define i32 @and_ne_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ne -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ne +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -387,21 +387,21 @@ entry: } define i32 @and_ne_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ne -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ne +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -411,21 +411,21 @@ entry: } define i32 @and_ne_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ne -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ne +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -435,21 +435,21 @@ entry: } define i32 @and_ne_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ne -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ne +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -459,21 +459,21 @@ entry: } define i32 @and_ne_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ne_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, ne -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ne_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, ne +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ne_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -483,21 +483,21 @@ entry: } define i32 @and_ult_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -507,21 +507,21 @@ entry: } define i32 @and_ult_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, lo -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, lo +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -531,21 +531,21 @@ entry: } define i32 @and_ult_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, lo -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, lo +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -555,21 +555,21 @@ entry: } define i32 @and_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, lo -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, lo +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -579,21 +579,21 @@ entry: } define i32 @and_ult_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -603,21 +603,21 @@ entry: } define i32 @and_ult_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -627,21 +627,21 @@ entry: } define i32 @and_ult_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -651,21 +651,21 @@ entry: } define i32 @and_ult_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -675,21 +675,21 @@ entry: } define i32 @and_ult_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, lo -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, lo +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -699,21 +699,21 @@ entry: } define i32 @and_ult_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ult_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, lo -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ult_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, lo +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ult_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -723,21 +723,21 @@ entry: } define i32 @and_ule_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ls -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ls +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -747,21 +747,21 @@ entry: } define i32 @and_ule_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ls -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ls +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -771,21 +771,21 @@ entry: } define i32 @and_ule_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ls -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ls +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -795,21 +795,21 @@ entry: } define i32 @and_ule_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ls -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ls +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -819,21 +819,21 @@ entry: } define i32 @and_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ls -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ls +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -843,21 +843,21 @@ entry: } define i32 @and_ule_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ls -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ls +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -867,21 +867,21 @@ entry: } define i32 @and_ule_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ls -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ls +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -891,21 +891,21 @@ entry: } define i32 @and_ule_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ls -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ls +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -915,21 +915,21 @@ entry: } define i32 @and_ule_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ls -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ls +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -939,21 +939,21 @@ entry: } define i32 @and_ule_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ule_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, ls -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ule_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, ls +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ule_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -963,21 +963,21 @@ entry: } define i32 @and_ugt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hi -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hi +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -987,21 +987,21 @@ entry: } define i32 @and_ugt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, hi -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, hi +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1011,21 +1011,21 @@ entry: } define i32 @and_ugt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hi -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hi +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -1035,21 +1035,21 @@ entry: } define i32 @and_ugt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hi -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hi +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -1059,21 +1059,21 @@ entry: } define i32 @and_ugt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hi -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hi +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -1083,21 +1083,21 @@ entry: } define i32 @and_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hi -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hi +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -1107,21 +1107,21 @@ entry: } define i32 @and_ugt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hi -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hi +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -1131,21 +1131,21 @@ entry: } define i32 @and_ugt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hi -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hi +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -1155,21 +1155,21 @@ entry: } define i32 @and_ugt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, hi -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, hi +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -1179,21 +1179,21 @@ entry: } define i32 @and_ugt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_ugt_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, hi -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_ugt_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, hi +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_ugt_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -1203,21 +1203,21 @@ entry: } define i32 @and_uge_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -1227,21 +1227,21 @@ entry: } define i32 @and_uge_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, hs -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, hs +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1251,21 +1251,21 @@ entry: } define i32 @and_uge_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hs -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hs +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -1275,21 +1275,21 @@ entry: } define i32 @and_uge_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hs -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hs +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -1299,21 +1299,21 @@ entry: } define i32 @and_uge_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -1323,21 +1323,21 @@ entry: } define i32 @and_uge_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -1347,21 +1347,21 @@ entry: } define i32 @and_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -1371,21 +1371,21 @@ entry: } define i32 @and_uge_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -1395,21 +1395,21 @@ entry: } define i32 @and_uge_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, hs -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, hs +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -1419,21 +1419,21 @@ entry: } define i32 @and_uge_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_uge_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, hs -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_uge_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, hs +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_uge_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -1443,21 +1443,21 @@ entry: } define i32 @and_slt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lt -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lt +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -1467,21 +1467,21 @@ entry: } define i32 @and_slt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, lt -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, lt +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1491,21 +1491,21 @@ entry: } define i32 @and_slt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, lt -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, lt +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -1515,21 +1515,21 @@ entry: } define i32 @and_slt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, lt -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, lt +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -1539,21 +1539,21 @@ entry: } define i32 @and_slt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lt -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lt +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -1563,21 +1563,21 @@ entry: } define i32 @and_slt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lt -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lt +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -1587,21 +1587,21 @@ entry: } define i32 @and_slt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lt -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lt +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -1611,21 +1611,21 @@ entry: } define i32 @and_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lt -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lt +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -1635,21 +1635,21 @@ entry: } define i32 @and_slt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, lt -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, lt +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -1659,21 +1659,21 @@ entry: } define i32 @and_slt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_slt_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, lt -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_slt_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, lt +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_slt_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -1683,21 +1683,21 @@ entry: } define i32 @and_sle_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, le -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, le +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -1707,21 +1707,21 @@ entry: } define i32 @and_sle_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, le -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, le +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1731,21 +1731,21 @@ entry: } define i32 @and_sle_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, le -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, le +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -1755,21 +1755,21 @@ entry: } define i32 @and_sle_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, le -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, le +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -1779,21 +1779,21 @@ entry: } define i32 @and_sle_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, le -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, le +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -1803,21 +1803,21 @@ entry: } define i32 @and_sle_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, le -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, le +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -1827,21 +1827,21 @@ entry: } define i32 @and_sle_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, le -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, le +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -1851,21 +1851,21 @@ entry: } define i32 @and_sle_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, le -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, le +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -1875,21 +1875,21 @@ entry: } define i32 @and_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, le -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, le +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -1899,21 +1899,21 @@ entry: } define i32 @and_sle_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sle_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, le -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sle_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, le +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sle_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -1923,21 +1923,21 @@ entry: } define i32 @and_sgt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, gt -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, gt +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -1947,21 +1947,21 @@ entry: } define i32 @and_sgt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, gt -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, gt +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1971,21 +1971,21 @@ entry: } define i32 @and_sgt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, gt -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, gt +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -1995,21 +1995,21 @@ entry: } define i32 @and_sgt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, gt -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, gt +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -2019,21 +2019,21 @@ entry: } define i32 @and_sgt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, gt -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, gt +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -2043,21 +2043,21 @@ entry: } define i32 @and_sgt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, gt -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, gt +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -2067,21 +2067,21 @@ entry: } define i32 @and_sgt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, gt -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, gt +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -2091,21 +2091,21 @@ entry: } define i32 @and_sgt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, gt -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, gt +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -2115,21 +2115,21 @@ entry: } define i32 @and_sgt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, gt -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, gt +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -2139,21 +2139,21 @@ entry: } define i32 @and_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sgt_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, gt -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sgt_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sgt_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, gt +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sgt_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -2163,21 +2163,21 @@ entry: } define i32 @and_sge_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_eq: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ge -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_eq: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_eq: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ge +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_eq: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp eq i32 %s2, %s3 @@ -2187,21 +2187,21 @@ entry: } define i32 @and_sge_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_ne: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ge -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_ne: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_ne: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ge +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_ne: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -2211,21 +2211,21 @@ entry: } define i32 @and_sge_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ge -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ge +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -2235,21 +2235,21 @@ entry: } define i32 @and_sge_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, ge -; SDISEL-NEXT: cset w0, ls -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, ge +; CHECK-SD-NEXT: cset w0, ls +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -2259,21 +2259,21 @@ entry: } define i32 @and_sge_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ge -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ge +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -2283,21 +2283,21 @@ entry: } define i32 @and_sge_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ge -; SDISEL-NEXT: cset w0, hs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ge +; CHECK-SD-NEXT: cset w0, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -2307,21 +2307,21 @@ entry: } define i32 @and_sge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ge -; SDISEL-NEXT: cset w0, lt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ge +; CHECK-SD-NEXT: cset w0, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -2331,21 +2331,21 @@ entry: } define i32 @and_sge_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, ge -; SDISEL-NEXT: cset w0, le -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, ge +; CHECK-SD-NEXT: cset w0, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -2355,21 +2355,21 @@ entry: } define i32 @and_sge_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #4, ge -; SDISEL-NEXT: cset w0, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #4, ge +; CHECK-SD-NEXT: cset w0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 @@ -2379,21 +2379,21 @@ entry: } define i32 @and_sge_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) { -; SDISEL-LABEL: and_sge_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #8, ge -; SDISEL-NEXT: cset w0, ge -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_sge_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_sge_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #8, ge +; CHECK-SD-NEXT: cset w0, ge +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_sge_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ge +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %c0 = icmp sge i32 %s0, %s1 %c1 = icmp sge i32 %s2, %s3 @@ -2403,19 +2403,19 @@ entry: } define i32 @cmp_to_ands1(i32 %num) { -; SDISEL-LABEL: cmp_to_ands1: -; SDISEL: // %bb.0: -; SDISEL-NEXT: and w8, w0, #0xff -; SDISEL-NEXT: tst w0, #0xfe -; SDISEL-NEXT: csel w0, w8, wzr, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands1: -; GISEL: // %bb.0: -; GISEL-NEXT: and w8, w0, #0xff -; GISEL-NEXT: cmp w8, #1 -; GISEL-NEXT: csel w0, w8, wzr, hi -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: tst w0, #0xfe +; CHECK-SD-NEXT: csel w0, w8, wzr, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: cmp w8, #1 +; CHECK-GI-NEXT: csel w0, w8, wzr, hi +; CHECK-GI-NEXT: ret %and = and i32 %num, 255 %cmp = icmp ugt i32 %and, 1 %r = select i1 %cmp, i32 %and, i32 0 @@ -2423,19 +2423,19 @@ define i32 @cmp_to_ands1(i32 %num) { } define i32 @cmp_to_ands2(i32 %num) { -; SDISEL-LABEL: cmp_to_ands2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: and w8, w0, #0xfe -; SDISEL-NEXT: tst w0, #0xc0 -; SDISEL-NEXT: csel w0, w8, wzr, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands2: -; GISEL: // %bb.0: -; GISEL-NEXT: and w8, w0, #0xfe -; GISEL-NEXT: cmp w8, #63 -; GISEL-NEXT: csel w0, w8, wzr, hi -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xfe +; CHECK-SD-NEXT: tst w0, #0xc0 +; CHECK-SD-NEXT: csel w0, w8, wzr, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xfe +; CHECK-GI-NEXT: cmp w8, #63 +; CHECK-GI-NEXT: csel w0, w8, wzr, hi +; CHECK-GI-NEXT: ret %and = and i32 %num, 254 %cmp = icmp ugt i32 %and, 63 %r = select i1 %cmp, i32 %and, i32 0 @@ -2443,19 +2443,19 @@ define i32 @cmp_to_ands2(i32 %num) { } define i32 @cmp_to_ands3(i32 %num, i32 %a) { -; SDISEL-LABEL: cmp_to_ands3: -; SDISEL: // %bb.0: -; SDISEL-NEXT: tst w0, #0x10 -; SDISEL-NEXT: csel w0, w1, wzr, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands3: -; GISEL: // %bb.0: -; GISEL-NEXT: mov w8, #23 // =0x17 -; GISEL-NEXT: and w8, w0, w8 -; GISEL-NEXT: cmp w8, #7 -; GISEL-NEXT: csel w0, w1, wzr, hi -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: tst w0, #0x10 +; CHECK-SD-NEXT: csel w0, w1, wzr, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #23 // =0x17 +; CHECK-GI-NEXT: and w8, w0, w8 +; CHECK-GI-NEXT: cmp w8, #7 +; CHECK-GI-NEXT: csel w0, w1, wzr, hi +; CHECK-GI-NEXT: ret %and = and i32 %num, 23 %cmp = icmp ugt i32 %and, 7 %r = select i1 %cmp, i32 %a, i32 0 @@ -2463,19 +2463,19 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) { } define i32 @cmp_to_ands4(i32 %num, i32 %a) { -; SDISEL-LABEL: cmp_to_ands4: -; SDISEL: // %bb.0: -; SDISEL-NEXT: and w8, w0, #0x30 -; SDISEL-NEXT: tst w0, #0x20 -; SDISEL-NEXT: csel w0, w8, w1, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands4: -; GISEL: // %bb.0: -; GISEL-NEXT: and w8, w0, #0x30 -; GISEL-NEXT: cmp w8, #31 -; GISEL-NEXT: csel w0, w8, w1, ls -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0x30 +; CHECK-SD-NEXT: tst w0, #0x20 +; CHECK-SD-NEXT: csel w0, w8, w1, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0x30 +; CHECK-GI-NEXT: cmp w8, #31 +; CHECK-GI-NEXT: csel w0, w8, w1, ls +; CHECK-GI-NEXT: ret %and = and i32 %num, 48 %cmp = icmp ule i32 %and, 31 %r = select i1 %cmp, i32 %and, i32 %a @@ -2483,19 +2483,19 @@ define i32 @cmp_to_ands4(i32 %num, i32 %a) { } define i32 @cmp_to_ands5(i32 %num, i32 %a) { -; SDISEL-LABEL: cmp_to_ands5: -; SDISEL: // %bb.0: -; SDISEL-NEXT: and w8, w0, #0xf8 -; SDISEL-NEXT: tst w0, #0xc0 -; SDISEL-NEXT: csel w0, w8, w1, eq -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands5: -; GISEL: // %bb.0: -; GISEL-NEXT: and w8, w0, #0xf8 -; GISEL-NEXT: cmp w8, #64 -; GISEL-NEXT: csel w0, w8, w1, lo -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands5: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xf8 +; CHECK-SD-NEXT: tst w0, #0xc0 +; CHECK-SD-NEXT: csel w0, w8, w1, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands5: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xf8 +; CHECK-GI-NEXT: cmp w8, #64 +; CHECK-GI-NEXT: csel w0, w8, w1, lo +; CHECK-GI-NEXT: ret %and = and i32 %num, 248 %cmp = icmp ult i32 %and, 64 %r = select i1 %cmp, i32 %and, i32 %a @@ -2503,19 +2503,19 @@ define i32 @cmp_to_ands5(i32 %num, i32 %a) { } define i32 @cmp_to_ands6(i32 %num) { -; SDISEL-LABEL: cmp_to_ands6: -; SDISEL: // %bb.0: -; SDISEL-NEXT: and w8, w0, #0xfe -; SDISEL-NEXT: tst w0, #0xf0 -; SDISEL-NEXT: csel w0, w8, wzr, ne -; SDISEL-NEXT: ret -; -; GISEL-LABEL: cmp_to_ands6: -; GISEL: // %bb.0: -; GISEL-NEXT: and w8, w0, #0xfe -; GISEL-NEXT: cmp w8, #16 -; GISEL-NEXT: csel w0, w8, wzr, hs -; GISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_to_ands6: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xfe +; CHECK-SD-NEXT: tst w0, #0xf0 +; CHECK-SD-NEXT: csel w0, w8, wzr, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmp_to_ands6: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xfe +; CHECK-GI-NEXT: cmp w8, #16 +; CHECK-GI-NEXT: csel w0, w8, wzr, hs +; CHECK-GI-NEXT: ret %and = and i32 %num, 254 %cmp = icmp uge i32 %and, 16 %r = select i1 %cmp, i32 %and, i32 0 @@ -2523,21 +2523,21 @@ define i32 @cmp_to_ands6(i32 %num) { } define i1 @and_fcmp(float %0, float %1) { -; SDISEL-LABEL: and_fcmp: -; SDISEL: // %bb.0: -; SDISEL-NEXT: fcmp s1, s1 -; SDISEL-NEXT: fccmp s0, s0, #0, vs -; SDISEL-NEXT: cset w0, vs -; SDISEL-NEXT: ret -; -; GISEL-LABEL: and_fcmp: -; GISEL: // %bb.0: -; GISEL-NEXT: fcmp s0, #0.0 -; GISEL-NEXT: cset w8, vs -; GISEL-NEXT: fcmp s1, #0.0 -; GISEL-NEXT: cset w9, vs -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-SD-LABEL: and_fcmp: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcmp s1, s1 +; CHECK-SD-NEXT: fccmp s0, s0, #0, vs +; CHECK-SD-NEXT: cset w0, vs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_fcmp: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcmp s0, #0.0 +; CHECK-GI-NEXT: cset w8, vs +; CHECK-GI-NEXT: fcmp s1, #0.0 +; CHECK-GI-NEXT: cset w9, vs +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %3 = fcmp uno float %0, 0.000000e+00 %4 = fcmp uno float %1, 0.000000e+00 diff --git a/llvm/test/CodeGen/AArch64/andorbrcompare.ll b/llvm/test/CodeGen/AArch64/andorbrcompare.ll index 951a5cd..5bc06ec 100644 --- a/llvm/test/CodeGen/AArch64/andorbrcompare.ll +++ b/llvm/test/CodeGen/AArch64/andorbrcompare.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare void @dummy() define i32 @and_eq_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_eq_ne_ult: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #0, ne -; SDISEL-NEXT: b.eq .LBB0_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.lo .LBB0_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB0_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_eq_ne_ult: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #0, ne +; CHECK-SD-NEXT: b.eq .LBB0_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.lo .LBB0_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB0_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_eq_ne_ult: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB0_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.lo .LBB0_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB0_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_eq_ne_ult: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB0_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.lo .LBB0_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB0_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp eq i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -56,40 +56,40 @@ else: } define i32 @and_ne_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_ne_ult_ule: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #4, lo -; SDISEL-NEXT: b.ne .LBB1_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.ls .LBB1_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB1_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_ne_ult_ule: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #4, lo +; CHECK-SD-NEXT: b.ne .LBB1_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.ls .LBB1_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB1_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_ne_ult_ule: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB1_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.ls .LBB1_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB1_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_ne_ult_ule: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB1_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.ls .LBB1_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB1_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp ne i32 %s0, %s1 %c1 = icmp ult i32 %s2, %s3 @@ -107,40 +107,40 @@ else: } define i32 @and_ult_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_ult_ule_ugt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #2, ls -; SDISEL-NEXT: b.lo .LBB2_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.hi .LBB2_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB2_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_ult_ule_ugt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #2, ls +; CHECK-SD-NEXT: b.lo .LBB2_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.hi .LBB2_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB2_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_ult_ule_ugt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ls -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB2_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.hi .LBB2_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB2_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_ult_ule_ugt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ls +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB2_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.hi .LBB2_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB2_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp ult i32 %s0, %s1 %c1 = icmp ule i32 %s2, %s3 @@ -158,40 +158,40 @@ else: } define i32 @and_ule_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_ule_ugt_uge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #2, hi -; SDISEL-NEXT: b.ls .LBB3_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.hs .LBB3_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB3_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_ule_ugt_uge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #2, hi +; CHECK-SD-NEXT: b.ls .LBB3_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.hs .LBB3_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB3_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_ule_ugt_uge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB3_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.hs .LBB3_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB3_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_ule_ugt_uge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB3_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.hs .LBB3_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB3_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp ule i32 %s0, %s1 %c1 = icmp ugt i32 %s2, %s3 @@ -209,40 +209,40 @@ else: } define i32 @and_ugt_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_ugt_uge_slt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #0, hs -; SDISEL-NEXT: b.hi .LBB4_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.lt .LBB4_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB4_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_ugt_uge_slt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #0, hs +; CHECK-SD-NEXT: b.hi .LBB4_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.lt .LBB4_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB4_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_ugt_uge_slt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hs -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB4_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.lt .LBB4_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB4_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_ugt_uge_slt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hs +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB4_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.lt .LBB4_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB4_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp ugt i32 %s0, %s1 %c1 = icmp uge i32 %s2, %s3 @@ -260,40 +260,40 @@ else: } define i32 @and_uge_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_uge_slt_sle: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #0, lt -; SDISEL-NEXT: b.hs .LBB5_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.le .LBB5_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB5_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_uge_slt_sle: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #0, lt +; CHECK-SD-NEXT: b.hs .LBB5_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.le .LBB5_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB5_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_uge_slt_sle: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, lt -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB5_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.le .LBB5_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB5_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_uge_slt_sle: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB5_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.le .LBB5_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB5_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp uge i32 %s0, %s1 %c1 = icmp slt i32 %s2, %s3 @@ -311,40 +311,40 @@ else: } define i32 @and_slt_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_slt_sle_sgt: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #0, le -; SDISEL-NEXT: b.lt .LBB6_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.gt .LBB6_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB6_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_slt_sle_sgt: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #0, le +; CHECK-SD-NEXT: b.lt .LBB6_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.gt .LBB6_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB6_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_slt_sle_sgt: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, le -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB6_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.gt .LBB6_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB6_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_slt_sle_sgt: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, le +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB6_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.gt .LBB6_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB6_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp slt i32 %s0, %s1 %c1 = icmp sle i32 %s2, %s3 @@ -362,40 +362,40 @@ else: } define i32 @and_sle_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) { -; SDISEL-LABEL: and_sle_sgt_sge: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #0, gt -; SDISEL-NEXT: b.le .LBB7_3 -; SDISEL-NEXT: // %bb.1: // %entry -; SDISEL-NEXT: cmp w4, w5 -; SDISEL-NEXT: b.ge .LBB7_3 -; SDISEL-NEXT: // %bb.2: -; SDISEL-NEXT: mov w0, wzr -; SDISEL-NEXT: ret -; SDISEL-NEXT: .LBB7_3: // %if -; SDISEL-NEXT: mov w0, #1 // =0x1 -; SDISEL-NEXT: str w0, [x6] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: and_sle_sgt_sge: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #0, gt +; CHECK-SD-NEXT: b.le .LBB7_3 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: cmp w4, w5 +; CHECK-SD-NEXT: b.ge .LBB7_3 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB7_3: // %if +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: str w0, [x6] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: and_sle_sgt_sge: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, le -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tbnz w8, #0, .LBB7_3 -; GISEL-NEXT: // %bb.1: // %entry -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: mov w0, wzr -; GISEL-NEXT: b.ge .LBB7_3 -; GISEL-NEXT: // %bb.2: // %common.ret -; GISEL-NEXT: ret -; GISEL-NEXT: .LBB7_3: // %if -; GISEL-NEXT: mov w0, #1 // =0x1 -; GISEL-NEXT: str w0, [x6] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: and_sle_sgt_sge: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, le +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB7_3 +; CHECK-GI-NEXT: // %bb.1: // %entry +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: b.ge .LBB7_3 +; CHECK-GI-NEXT: // %bb.2: // %common.ret +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB7_3: // %if +; CHECK-GI-NEXT: mov w0, #1 // =0x1 +; CHECK-GI-NEXT: str w0, [x6] +; CHECK-GI-NEXT: ret entry: %c0 = icmp sle i32 %s0, %s1 %c1 = icmp sgt i32 %s2, %s3 diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index 06e957f..a546ffd 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI target triple = "arm64-apple-ios" define i32 @single_same(i32 %a, i32 %b) nounwind ssp { @@ -32,31 +32,31 @@ if.end: ; Different condition codes for the two compares. define i32 @single_different(i32 %a, i32 %b) nounwind ssp { -; SDISEL-LABEL: single_different: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: cmp w0, #6 -; SDISEL-NEXT: ccmp w1, #17, #0, ge -; SDISEL-NEXT: b.eq LBB1_2 -; SDISEL-NEXT: ; %bb.1: ; %if.then -; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; SDISEL-NEXT: bl _foo -; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: LBB1_2: ; %if.end -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: single_different: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: cmp w0, #6 +; CHECK-SD-NEXT: ccmp w1, #17, #0, ge +; CHECK-SD-NEXT: b.eq LBB1_2 +; CHECK-SD-NEXT: ; %bb.1: ; %if.then +; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: bl _foo +; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-SD-NEXT: LBB1_2: ; %if.end +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: single_different: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: cmp w0, #5 -; GISEL-NEXT: ccmp w1, #17, #0, gt -; GISEL-NEXT: b.eq LBB1_2 -; GISEL-NEXT: ; %bb.1: ; %if.then -; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; GISEL-NEXT: bl _foo -; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: LBB1_2: ; %if.end -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: single_different: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: cmp w0, #5 +; CHECK-GI-NEXT: ccmp w1, #17, #0, gt +; CHECK-GI-NEXT: b.eq LBB1_2 +; CHECK-GI-NEXT: ; %bb.1: ; %if.then +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: bl _foo +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-GI-NEXT: LBB1_2: ; %if.end +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret entry: %cmp = icmp sle i32 %a, 5 %cmp1 = icmp ne i32 %b, 17 @@ -73,41 +73,41 @@ if.end: ; Second block clobbers the flags, can't convert (easily). define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { -; SDISEL-LABEL: single_flagclobber: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: cmp w0, #5 -; SDISEL-NEXT: b.eq LBB2_2 -; SDISEL-NEXT: ; %bb.1: ; %lor.lhs.false -; SDISEL-NEXT: lsl w8, w1, #1 -; SDISEL-NEXT: cmp w1, #7 -; SDISEL-NEXT: csinc w8, w8, w1, lt -; SDISEL-NEXT: cmp w8, #16 -; SDISEL-NEXT: b.gt LBB2_3 -; SDISEL-NEXT: LBB2_2: ; %if.then -; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; SDISEL-NEXT: bl _foo -; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: LBB2_3: ; %if.end -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: single_flagclobber: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: cmp w0, #5 +; CHECK-SD-NEXT: b.eq LBB2_2 +; CHECK-SD-NEXT: ; %bb.1: ; %lor.lhs.false +; CHECK-SD-NEXT: lsl w8, w1, #1 +; CHECK-SD-NEXT: cmp w1, #7 +; CHECK-SD-NEXT: csinc w8, w8, w1, lt +; CHECK-SD-NEXT: cmp w8, #16 +; CHECK-SD-NEXT: b.gt LBB2_3 +; CHECK-SD-NEXT: LBB2_2: ; %if.then +; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: bl _foo +; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-SD-NEXT: LBB2_3: ; %if.end +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: single_flagclobber: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: cmp w0, #5 -; GISEL-NEXT: b.eq LBB2_2 -; GISEL-NEXT: ; %bb.1: ; %lor.lhs.false -; GISEL-NEXT: lsl w8, w1, #1 -; GISEL-NEXT: cmp w1, #7 -; GISEL-NEXT: csinc w8, w8, w1, lt -; GISEL-NEXT: cmp w8, #17 -; GISEL-NEXT: b.ge LBB2_3 -; GISEL-NEXT: LBB2_2: ; %if.then -; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; GISEL-NEXT: bl _foo -; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: LBB2_3: ; %if.end -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: single_flagclobber: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: cmp w0, #5 +; CHECK-GI-NEXT: b.eq LBB2_2 +; CHECK-GI-NEXT: ; %bb.1: ; %lor.lhs.false +; CHECK-GI-NEXT: lsl w8, w1, #1 +; CHECK-GI-NEXT: cmp w1, #7 +; CHECK-GI-NEXT: csinc w8, w8, w1, lt +; CHECK-GI-NEXT: cmp w8, #17 +; CHECK-GI-NEXT: b.ge LBB2_3 +; CHECK-GI-NEXT: LBB2_2: ; %if.then +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: bl _foo +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-GI-NEXT: LBB2_3: ; %if.end +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 br i1 %cmp, label %if.then, label %lor.lhs.false @@ -171,37 +171,37 @@ if.end: ; preds = %if.then, %lor.lhs.f ; The sdiv/udiv instructions do not trap when the divisor is zero, so they are ; safe to speculate. define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { -; SDISEL-LABEL: speculate_division: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: cmp w0, #1 -; SDISEL-NEXT: sdiv w8, w1, w0 -; SDISEL-NEXT: ccmp w8, #16, #0, ge -; SDISEL-NEXT: b.le LBB4_2 -; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret -; SDISEL-NEXT: LBB4_2: ; %if.then -; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; SDISEL-NEXT: bl _foo -; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: speculate_division: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: cmp w0, #1 +; CHECK-SD-NEXT: sdiv w8, w1, w0 +; CHECK-SD-NEXT: ccmp w8, #16, #0, ge +; CHECK-SD-NEXT: b.le LBB4_2 +; CHECK-SD-NEXT: ; %bb.1: ; %if.end +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: LBB4_2: ; %if.then +; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: bl _foo +; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: speculate_division: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: sdiv w8, w1, w0 -; GISEL-NEXT: ccmp w8, #17, #0, gt -; GISEL-NEXT: b.lt LBB4_2 -; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret -; GISEL-NEXT: LBB4_2: ; %if.then -; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; GISEL-NEXT: bl _foo -; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: speculate_division: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: sdiv w8, w1, w0 +; CHECK-GI-NEXT: ccmp w8, #17, #0, gt +; CHECK-GI-NEXT: b.lt LBB4_2 +; CHECK-GI-NEXT: ; %bb.1: ; %if.end +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: LBB4_2: ; %if.then +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: bl _foo +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 br i1 %cmp, label %land.lhs.true, label %if.end @@ -221,41 +221,41 @@ if.end: ; Floating point compare. define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { -; SDISEL-LABEL: single_fcmp: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: cmp w0, #1 -; SDISEL-NEXT: scvtf s1, w0 -; SDISEL-NEXT: fdiv s0, s0, s1 -; SDISEL-NEXT: fmov s1, #17.00000000 -; SDISEL-NEXT: fccmp s0, s1, #8, ge -; SDISEL-NEXT: b.ge LBB5_2 -; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret -; SDISEL-NEXT: LBB5_2: ; %if.then -; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; SDISEL-NEXT: bl _foo -; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 ; =0x7 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: single_fcmp: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: cmp w0, #1 +; CHECK-SD-NEXT: scvtf s1, w0 +; CHECK-SD-NEXT: fdiv s0, s0, s1 +; CHECK-SD-NEXT: fmov s1, #17.00000000 +; CHECK-SD-NEXT: fccmp s0, s1, #8, ge +; CHECK-SD-NEXT: b.ge LBB5_2 +; CHECK-SD-NEXT: ; %bb.1: ; %if.end +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: LBB5_2: ; %if.then +; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: bl _foo +; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, #7 ; =0x7 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: single_fcmp: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: scvtf s1, w0 -; GISEL-NEXT: fdiv s0, s0, s1 -; GISEL-NEXT: fmov s1, #17.00000000 -; GISEL-NEXT: fccmp s0, s1, #8, gt -; GISEL-NEXT: b.ge LBB5_2 -; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret -; GISEL-NEXT: LBB5_2: ; %if.then -; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; GISEL-NEXT: bl _foo -; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 ; =0x7 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: single_fcmp: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: scvtf s1, w0 +; CHECK-GI-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NEXT: fmov s1, #17.00000000 +; CHECK-GI-NEXT: fccmp s0, s1, #8, gt +; CHECK-GI-NEXT: b.ge LBB5_2 +; CHECK-GI-NEXT: ; %bb.1: ; %if.end +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: LBB5_2: ; %if.then +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: bl _foo +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-GI-NEXT: mov w0, #7 ; =0x7 +; CHECK-GI-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 br i1 %cmp, label %land.lhs.true, label %if.end @@ -499,28 +499,28 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) { } define i64 @gccbug(i64 %x0, i64 %x1) { -; SDISEL-LABEL: gccbug: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmp x0, #2 -; SDISEL-NEXT: ccmp x0, #4, #4, ne -; SDISEL-NEXT: ccmp x1, #0, #0, eq -; SDISEL-NEXT: mov w8, #1 ; =0x1 -; SDISEL-NEXT: cinc x0, x8, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: gccbug: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmp x0, #2 +; CHECK-SD-NEXT: ccmp x0, #4, #4, ne +; CHECK-SD-NEXT: ccmp x1, #0, #0, eq +; CHECK-SD-NEXT: mov w8, #1 ; =0x1 +; CHECK-SD-NEXT: cinc x0, x8, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: gccbug: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmp x1, #0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp x0, #2 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cmp x0, #4 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: orr w9, w10, w9 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: and x8, x8, #0x1 -; GISEL-NEXT: add x0, x8, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: gccbug: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmp x1, #0 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmp x0, #2 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: cmp x0, #4 +; CHECK-GI-NEXT: cset w10, eq +; CHECK-GI-NEXT: orr w9, w10, w9 +; CHECK-GI-NEXT: and w8, w9, w8 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: add x0, x8, #1 +; CHECK-GI-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 %cmp1 = icmp eq i64 %x0, 2 %cmp2 = icmp eq i64 %x0, 4 @@ -570,23 +570,23 @@ define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { } define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) { -; SDISEL-LABEL: select_andor32: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmp w1, w2 -; SDISEL-NEXT: mov w8, #32 ; =0x20 -; SDISEL-NEXT: ccmp w0, w8, #4, lt -; SDISEL-NEXT: ccmp w0, w1, #0, eq -; SDISEL-NEXT: csel w0, w0, w1, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: select_andor32: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmp w1, w2 +; CHECK-SD-NEXT: mov w8, #32 ; =0x20 +; CHECK-SD-NEXT: ccmp w0, w8, #4, lt +; CHECK-SD-NEXT: ccmp w0, w1, #0, eq +; CHECK-SD-NEXT: csel w0, w0, w1, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: select_andor32: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #32 ; =0x20 -; GISEL-NEXT: cmp w1, w2 -; GISEL-NEXT: ccmp w0, w8, #4, lt -; GISEL-NEXT: ccmp w0, w1, #0, eq -; GISEL-NEXT: csel w0, w0, w1, eq -; GISEL-NEXT: ret +; CHECK-GI-LABEL: select_andor32: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: mov w8, #32 ; =0x20 +; CHECK-GI-NEXT: cmp w1, w2 +; CHECK-GI-NEXT: ccmp w0, w8, #4, lt +; CHECK-GI-NEXT: ccmp w0, w1, #0, eq +; CHECK-GI-NEXT: csel w0, w0, w1, eq +; CHECK-GI-NEXT: ret %c0 = icmp eq i32 %v1, %v2 %c1 = icmp sge i32 %v2, %v3 %c2 = icmp eq i32 %v1, 32 @@ -597,22 +597,22 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) { } define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { -; SDISEL-LABEL: select_noccmp1: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmp x0, #0 -; SDISEL-NEXT: ccmp x0, #13, #4, lt -; SDISEL-NEXT: cset w8, gt -; SDISEL-NEXT: cmp x2, #2 -; SDISEL-NEXT: ccmp x2, #4, #4, lt -; SDISEL-NEXT: csinc w8, w8, wzr, le -; SDISEL-NEXT: cmp w8, #0 -; SDISEL-NEXT: csel x0, xzr, x3, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: select_noccmp1: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: ccmp x0, #13, #4, lt +; CHECK-SD-NEXT: cset w8, gt +; CHECK-SD-NEXT: cmp x2, #2 +; CHECK-SD-NEXT: ccmp x2, #4, #4, lt +; CHECK-SD-NEXT: csinc w8, w8, wzr, le +; CHECK-SD-NEXT: cmp w8, #0 +; CHECK-SD-NEXT: csel x0, xzr, x3, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: select_noccmp1: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x0, x3 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: select_noccmp1: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: mov x0, x3 +; CHECK-GI-NEXT: ret %c0 = icmp slt i64 %v1, 0 %c1 = icmp sgt i64 %v1, 13 %c2 = icmp slt i64 %v3, 2 @@ -627,28 +627,28 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { @g = global i32 0 define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) { -; SDISEL-LABEL: select_noccmp2: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmp x0, #0 -; SDISEL-NEXT: ccmp x0, #13, #0, ge -; SDISEL-NEXT: cset w8, gt -; SDISEL-NEXT: cmp w8, #0 -; SDISEL-NEXT: csel x0, xzr, x3, ne -; SDISEL-NEXT: sbfx w8, w8, #0, #1 -; SDISEL-NEXT: adrp x9, _g@PAGE -; SDISEL-NEXT: str w8, [x9, _g@PAGEOFF] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: select_noccmp2: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: ccmp x0, #13, #0, ge +; CHECK-SD-NEXT: cset w8, gt +; CHECK-SD-NEXT: cmp w8, #0 +; CHECK-SD-NEXT: csel x0, xzr, x3, ne +; CHECK-SD-NEXT: sbfx w8, w8, #0, #1 +; CHECK-SD-NEXT: adrp x9, _g@PAGE +; CHECK-SD-NEXT: str w8, [x9, _g@PAGEOFF] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: select_noccmp2: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmp x0, #14 -; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel x0, xzr, x3, ne -; GISEL-NEXT: sbfx w8, w8, #0, #1 -; GISEL-NEXT: adrp x9, _g@PAGE -; GISEL-NEXT: str w8, [x9, _g@PAGEOFF] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: select_noccmp2: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmp x0, #14 +; CHECK-GI-NEXT: cset w8, hs +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: csel x0, xzr, x3, ne +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: adrp x9, _g@PAGE +; CHECK-GI-NEXT: str w8, [x9, _g@PAGEOFF] +; CHECK-GI-NEXT: ret %c0 = icmp slt i64 %v1, 0 %c1 = icmp sgt i64 %v1, 13 %or = or i1 %c0, %c1 @@ -661,33 +661,33 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) { ; The following is not possible to implement with a single cmp;ccmp;csel ; sequence. define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) { -; SDISEL-LABEL: select_noccmp3: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmp w0, #0 -; SDISEL-NEXT: ccmp w0, #13, #0, ge -; SDISEL-NEXT: cset w8, gt -; SDISEL-NEXT: cmp w0, #22 -; SDISEL-NEXT: mov w9, #44 ; =0x2c -; SDISEL-NEXT: ccmp w0, w9, #0, ge -; SDISEL-NEXT: csel w8, wzr, w8, le -; SDISEL-NEXT: cmp w0, #99 -; SDISEL-NEXT: mov w9, #77 ; =0x4d -; SDISEL-NEXT: ccmp w0, w9, #4, ne -; SDISEL-NEXT: cset w9, eq -; SDISEL-NEXT: tst w8, w9 -; SDISEL-NEXT: csel w0, w1, w2, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: select_noccmp3: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ccmp w0, #13, #0, ge +; CHECK-SD-NEXT: cset w8, gt +; CHECK-SD-NEXT: cmp w0, #22 +; CHECK-SD-NEXT: mov w9, #44 ; =0x2c +; CHECK-SD-NEXT: ccmp w0, w9, #0, ge +; CHECK-SD-NEXT: csel w8, wzr, w8, le +; CHECK-SD-NEXT: cmp w0, #99 +; CHECK-SD-NEXT: mov w9, #77 ; =0x4d +; CHECK-SD-NEXT: ccmp w0, w9, #4, ne +; CHECK-SD-NEXT: cset w9, eq +; CHECK-SD-NEXT: tst w8, w9 +; CHECK-SD-NEXT: csel w0, w1, w2, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: select_noccmp3: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #99 ; =0x63 -; GISEL-NEXT: sub w9, w0, #45 -; GISEL-NEXT: cmp w0, #77 -; GISEL-NEXT: ccmp w0, w8, #4, ne -; GISEL-NEXT: ccmn w9, #23, #2, eq -; GISEL-NEXT: ccmp w0, #14, #0, lo -; GISEL-NEXT: csel w0, w1, w2, hs -; GISEL-NEXT: ret +; CHECK-GI-LABEL: select_noccmp3: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: mov w8, #99 ; =0x63 +; CHECK-GI-NEXT: sub w9, w0, #45 +; CHECK-GI-NEXT: cmp w0, #77 +; CHECK-GI-NEXT: ccmp w0, w8, #4, ne +; CHECK-GI-NEXT: ccmn w9, #23, #2, eq +; CHECK-GI-NEXT: ccmp w0, #14, #0, lo +; CHECK-GI-NEXT: csel w0, w1, w2, hs +; CHECK-GI-NEXT: ret %c0 = icmp slt i32 %v0, 0 %c1 = icmp sgt i32 %v0, 13 %c2 = icmp slt i32 %v0, 22 @@ -864,27 +864,27 @@ define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3 ; Verify that we correctly promote f16. define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 { -; SDISEL-LABEL: half_select_and_olt_oge: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: fcvt s1, h1 -; SDISEL-NEXT: fcvt s0, h0 -; SDISEL-NEXT: fcmp s0, s1 -; SDISEL-NEXT: fcvt s0, h3 -; SDISEL-NEXT: fcvt s1, h2 -; SDISEL-NEXT: fccmp s1, s0, #8, mi -; SDISEL-NEXT: csel w0, w0, w1, ge -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: half_select_and_olt_oge: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: fcmp s0, s1 +; CHECK-SD-NEXT: fcvt s0, h3 +; CHECK-SD-NEXT: fcvt s1, h2 +; CHECK-SD-NEXT: fccmp s1, s0, #8, mi +; CHECK-SD-NEXT: csel w0, w0, w1, ge +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: half_select_and_olt_oge: -; GISEL: ; %bb.0: -; GISEL-NEXT: fcvt s0, h0 -; GISEL-NEXT: fcvt s1, h1 -; GISEL-NEXT: fcvt s2, h2 -; GISEL-NEXT: fcvt s3, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: fccmp s2, s3, #8, mi -; GISEL-NEXT: csel w0, w0, w1, ge -; GISEL-NEXT: ret +; CHECK-GI-LABEL: half_select_and_olt_oge: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: fcvt s2, h2 +; CHECK-GI-NEXT: fcvt s3, h3 +; CHECK-GI-NEXT: fcmp s0, s1 +; CHECK-GI-NEXT: fccmp s2, s3, #8, mi +; CHECK-GI-NEXT: csel w0, w0, w1, ge +; CHECK-GI-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp oge half %v2, %v3 %cr = and i1 %c1, %c0 @@ -893,29 +893,29 @@ define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 } define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 { -; SDISEL-LABEL: half_select_and_olt_one: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: fcvt s1, h1 -; SDISEL-NEXT: fcvt s0, h0 -; SDISEL-NEXT: fcmp s0, s1 -; SDISEL-NEXT: fcvt s0, h3 -; SDISEL-NEXT: fcvt s1, h2 -; SDISEL-NEXT: fccmp s1, s0, #4, mi -; SDISEL-NEXT: fccmp s1, s0, #1, ne -; SDISEL-NEXT: csel w0, w0, w1, vc -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: half_select_and_olt_one: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: fcmp s0, s1 +; CHECK-SD-NEXT: fcvt s0, h3 +; CHECK-SD-NEXT: fcvt s1, h2 +; CHECK-SD-NEXT: fccmp s1, s0, #4, mi +; CHECK-SD-NEXT: fccmp s1, s0, #1, ne +; CHECK-SD-NEXT: csel w0, w0, w1, vc +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: half_select_and_olt_one: -; GISEL: ; %bb.0: -; GISEL-NEXT: fcvt s0, h0 -; GISEL-NEXT: fcvt s1, h1 -; GISEL-NEXT: fcvt s2, h2 -; GISEL-NEXT: fcvt s3, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: fccmp s2, s3, #4, mi -; GISEL-NEXT: fccmp s2, s3, #1, ne -; GISEL-NEXT: csel w0, w0, w1, vc -; GISEL-NEXT: ret +; CHECK-GI-LABEL: half_select_and_olt_one: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: fcvt s2, h2 +; CHECK-GI-NEXT: fcvt s3, h3 +; CHECK-GI-NEXT: fcmp s0, s1 +; CHECK-GI-NEXT: fccmp s2, s3, #4, mi +; CHECK-GI-NEXT: fccmp s2, s3, #1, ne +; CHECK-GI-NEXT: csel w0, w0, w1, vc +; CHECK-GI-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp one half %v2, %v3 %cr = and i1 %c1, %c0 @@ -926,51 +926,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 ; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead. define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 { -; SDISEL-LABEL: f128_select_and_olt_oge: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: sub sp, sp, #80 -; SDISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill -; SDISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill -; SDISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill -; SDISEL-NEXT: mov x19, x1 -; SDISEL-NEXT: mov x20, x0 -; SDISEL-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill -; SDISEL-NEXT: bl ___lttf2 -; SDISEL-NEXT: cmp w0, #0 -; SDISEL-NEXT: cset w21, lt -; SDISEL-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload -; SDISEL-NEXT: bl ___getf2 -; SDISEL-NEXT: cmp w0, #0 -; SDISEL-NEXT: cset w8, ge -; SDISEL-NEXT: tst w8, w21 -; SDISEL-NEXT: csel w0, w20, w19, ne -; SDISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload -; SDISEL-NEXT: add sp, sp, #80 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: f128_select_and_olt_oge: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #80 +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-SD-NEXT: mov x19, x1 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill +; CHECK-SD-NEXT: bl ___lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: cset w21, lt +; CHECK-SD-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload +; CHECK-SD-NEXT: bl ___getf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: cset w8, ge +; CHECK-SD-NEXT: tst w8, w21 +; CHECK-SD-NEXT: csel w0, w20, w19, ne +; CHECK-SD-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #80 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: f128_select_and_olt_oge: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #80 -; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill -; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill -; GISEL-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill -; GISEL-NEXT: mov x19, x0 -; GISEL-NEXT: mov x20, x1 -; GISEL-NEXT: bl ___lttf2 -; GISEL-NEXT: mov x21, x0 -; GISEL-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload -; GISEL-NEXT: bl ___getf2 -; GISEL-NEXT: cmp w21, #0 -; GISEL-NEXT: ccmp w0, #0, #8, lt -; GISEL-NEXT: csel w0, w19, w20, ge -; GISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload -; GISEL-NEXT: add sp, sp, #80 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: f128_select_and_olt_oge: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x1 +; CHECK-GI-NEXT: bl ___lttf2 +; CHECK-GI-NEXT: mov x21, x0 +; CHECK-GI-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload +; CHECK-GI-NEXT: bl ___getf2 +; CHECK-GI-NEXT: cmp w21, #0 +; CHECK-GI-NEXT: ccmp w0, #0, #8, lt +; CHECK-GI-NEXT: csel w0, w19, w20, ge +; CHECK-GI-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret %c0 = fcmp olt fp128 %v0, %v1 %c1 = fcmp oge fp128 %v2, %v3 %cr = and i1 %c1, %c0 @@ -1048,46 +1048,46 @@ define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; This test is trying to test that multiple ccmp's don't get created in a way ; that they would have multiple uses. It doesn't seem to. define i32 @multiccmp(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %x, i32 %y) #0 { -; SDISEL-LABEL: multiccmp: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill -; SDISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill -; SDISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill -; SDISEL-NEXT: mov x19, x5 -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: cset w20, gt -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: cset w21, ne -; SDISEL-NEXT: tst w20, w21 -; SDISEL-NEXT: csel w0, w5, w4, ne -; SDISEL-NEXT: bl _callee -; SDISEL-NEXT: tst w20, w21 -; SDISEL-NEXT: csel w0, w0, w19, ne -; SDISEL-NEXT: bl _callee -; SDISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: multiccmp: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w20, gt +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: cset w21, ne +; CHECK-SD-NEXT: tst w20, w21 +; CHECK-SD-NEXT: csel w0, w5, w4, ne +; CHECK-SD-NEXT: bl _callee +; CHECK-SD-NEXT: tst w20, w21 +; CHECK-SD-NEXT: csel w0, w0, w19, ne +; CHECK-SD-NEXT: bl _callee +; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: multiccmp: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill -; GISEL-NEXT: mov x19, x5 -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w20, w8, w9 -; GISEL-NEXT: tst w20, #0x1 -; GISEL-NEXT: csel w0, w5, w4, ne -; GISEL-NEXT: bl _callee -; GISEL-NEXT: tst w20, #0x1 -; GISEL-NEXT: csel w0, w0, w19, ne -; GISEL-NEXT: bl _callee -; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload -; GISEL-NEXT: ret +; CHECK-GI-LABEL: multiccmp: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-GI-NEXT: mov x19, x5 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w20, w8, w9 +; CHECK-GI-NEXT: tst w20, #0x1 +; CHECK-GI-NEXT: csel w0, w5, w4, ne +; CHECK-GI-NEXT: bl _callee +; CHECK-GI-NEXT: tst w20, #0x1 +; CHECK-GI-NEXT: csel w0, w0, w19, ne +; CHECK-GI-NEXT: bl _callee +; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1100,57 +1100,57 @@ entry: } define i32 @multiccmp2(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %x, i32 %y) #0 { -; SDISEL-LABEL: multiccmp2: -; SDISEL: ; %bb.0: ; %entry -; SDISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill -; SDISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill -; SDISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill -; SDISEL-NEXT: mov x19, x5 -; SDISEL-NEXT: mov x20, x3 -; SDISEL-NEXT: mov x21, x0 -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: cset w8, gt -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: cset w22, ne -; SDISEL-NEXT: tst w8, w22 -; SDISEL-NEXT: csel w0, w5, w4, ne -; SDISEL-NEXT: bl _callee -; SDISEL-NEXT: cmp w21, w20 -; SDISEL-NEXT: cset w8, eq -; SDISEL-NEXT: tst w22, w8 -; SDISEL-NEXT: csel w0, w0, w19, ne -; SDISEL-NEXT: bl _callee -; SDISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload -; SDISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: multiccmp2: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill +; CHECK-SD-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w8, gt +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: cset w22, ne +; CHECK-SD-NEXT: tst w8, w22 +; CHECK-SD-NEXT: csel w0, w5, w4, ne +; CHECK-SD-NEXT: bl _callee +; CHECK-SD-NEXT: cmp w21, w20 +; CHECK-SD-NEXT: cset w8, eq +; CHECK-SD-NEXT: tst w22, w8 +; CHECK-SD-NEXT: csel w0, w0, w19, ne +; CHECK-SD-NEXT: bl _callee +; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: multiccmp2: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill -; GISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill -; GISEL-NEXT: mov x19, x0 -; GISEL-NEXT: mov x20, x3 -; GISEL-NEXT: mov x21, x5 -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w22, ne -; GISEL-NEXT: and w8, w8, w22 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w5, w4, ne -; GISEL-NEXT: bl _callee -; GISEL-NEXT: cmp w19, w20 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: and w8, w22, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w21, ne -; GISEL-NEXT: bl _callee -; GISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload -; GISEL-NEXT: ret +; CHECK-GI-LABEL: multiccmp2: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill +; CHECK-GI-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x5 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w22, ne +; CHECK-GI-NEXT: and w8, w8, w22 +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: csel w0, w5, w4, ne +; CHECK-GI-NEXT: bl _callee +; CHECK-GI-NEXT: cmp w19, w20 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: and w8, w22, w8 +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: csel w0, w0, w21, ne +; CHECK-GI-NEXT: bl _callee +; CHECK-GI-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %c0 = icmp sgt i32 %s0, %s1 %c1 = icmp ne i32 %s2, %s3 @@ -1168,21 +1168,21 @@ entry: declare i32 @callee(i32) define i1 @cmp_and_negative_const(i32 %0, i32 %1) { -; SDISEL-LABEL: cmp_and_negative_const: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmn w0, #1 -; SDISEL-NEXT: ccmn w1, #2, #0, eq -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_and_negative_const: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmn w0, #1 +; CHECK-SD-NEXT: ccmn w1, #2, #0, eq +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_and_negative_const: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmn w0, #1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmn w1, #2 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_and_negative_const: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmn w0, #1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmn w1, #2 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %3 = icmp eq i32 %0, -1 %4 = icmp eq i32 %1, -2 %5 = and i1 %3, %4 @@ -1190,21 +1190,21 @@ define i1 @cmp_and_negative_const(i32 %0, i32 %1) { } define i1 @cmp_or_negative_const(i32 %a, i32 %b) { -; SDISEL-LABEL: cmp_or_negative_const: -; SDISEL: ; %bb.0: -; SDISEL-NEXT: cmn w0, #1 -; SDISEL-NEXT: ccmn w1, #2, #4, ne -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_or_negative_const: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmn w0, #1 +; CHECK-SD-NEXT: ccmn w1, #2, #4, ne +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_or_negative_const: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmn w0, #1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmn w1, #2 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: orr w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_or_negative_const: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmn w0, #1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: cmn w1, #2 +; CHECK-GI-NEXT: cset w9, eq +; CHECK-GI-NEXT: orr w0, w8, w9 +; CHECK-GI-NEXT: ret %cmp = icmp eq i32 %a, -1 %cmp1 = icmp eq i32 %b, -2 %or.cond = or i1 %cmp, %cmp1 diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll index 4b816df..3620444 100644 --- a/llvm/test/CodeGen/AArch64/cmp-chains.ll +++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll @@ -1,26 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc < %s -mtriple=aarch64-- -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64-- -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Ensure chains of comparisons produce chains of `ccmp` ; (x0 < x1) && (x2 > x3) define i32 @cmp_and2(i32 %0, i32 %1, i32 %2, i32 %3) { -; SDISEL-LABEL: cmp_and2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: cset w0, hi -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_and2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_and2: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_and2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %5 = icmp ult i32 %0, %1 %6 = icmp ugt i32 %2, %3 %7 = select i1 %5, i1 %6, i1 false @@ -30,25 +30,25 @@ define i32 @cmp_and2(i32 %0, i32 %1, i32 %2, i32 %3) { ; (x0 < x1) && (x2 > x3) && (x4 != x5) define i32 @cmp_and3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { -; SDISEL-LABEL: cmp_and3: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, lo -; SDISEL-NEXT: ccmp w4, w5, #4, hi -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_and3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, lo +; CHECK-SD-NEXT: ccmp w4, w5, #4, hi +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_and3: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_and3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %7 = icmp ult i32 %0, %1 %8 = icmp ugt i32 %2, %3 %9 = select i1 %7, i1 %8, i1 false @@ -60,29 +60,29 @@ define i32 @cmp_and3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { ; (x0 < x1) && (x2 > x3) && (x4 != x5) && (x6 == x7) define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) { -; SDISEL-LABEL: cmp_and4: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w2, w3 -; SDISEL-NEXT: ccmp w0, w1, #2, hi -; SDISEL-NEXT: ccmp w4, w5, #4, lo -; SDISEL-NEXT: ccmp w6, w7, #0, ne -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_and4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w2, w3 +; CHECK-SD-NEXT: ccmp w0, w1, #2, hi +; CHECK-SD-NEXT: ccmp w4, w5, #4, lo +; CHECK-SD-NEXT: ccmp w6, w7, #0, ne +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_and4: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w8, hi -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: cset w10, ne -; GISEL-NEXT: cmp w6, w7 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_and4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: cset w10, ne +; CHECK-GI-NEXT: cmp w6, w7 +; CHECK-GI-NEXT: and w8, w8, w9 +; CHECK-GI-NEXT: cset w11, eq +; CHECK-GI-NEXT: and w9, w10, w11 +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %9 = icmp ugt i32 %2, %3 %10 = icmp ult i32 %0, %1 %11 = select i1 %9, i1 %10, i1 false @@ -96,22 +96,22 @@ define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 ; (x0 < x1) || (x2 > x3) define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) { -; SDISEL-LABEL: cmp_or2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #0, hs -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_or2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #0, hs +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_or2: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w0, w8, #0x1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_or2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret %5 = icmp ult i32 %0, %1 %6 = icmp ne i32 %2, %3 %7 = select i1 %5, i1 true, i1 %6 @@ -121,26 +121,26 @@ define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) { ; (x0 < x1) || (x2 > x3) || (x4 != x5) define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { -; SDISEL-LABEL: cmp_or3: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hs -; SDISEL-NEXT: ccmp w4, w5, #0, ls -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_or3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hs +; CHECK-SD-NEXT: ccmp w4, w5, #0, ls +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_or3: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w0, w8, #0x1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_or3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret %7 = icmp ult i32 %0, %1 %8 = icmp ugt i32 %2, %3 %9 = select i1 %7, i1 true, i1 %8 @@ -152,30 +152,30 @@ define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { ; (x0 < x1) || (x2 > x3) || (x4 != x5) || (x6 == x7) define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) { -; SDISEL-LABEL: cmp_or4: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w1 -; SDISEL-NEXT: ccmp w2, w3, #2, hs -; SDISEL-NEXT: ccmp w4, w5, #0, ls -; SDISEL-NEXT: ccmp w6, w7, #4, eq -; SDISEL-NEXT: cset w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: cmp_or4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: ccmp w2, w3, #2, hs +; CHECK-SD-NEXT: ccmp w4, w5, #0, ls +; CHECK-SD-NEXT: ccmp w6, w7, #4, eq +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: cmp_or4: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w3 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: cmp w4, w5 -; GISEL-NEXT: cset w10, ne -; GISEL-NEXT: cmp w6, w7 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w0, w8, #0x1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: cmp_or4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w3 +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cmp w4, w5 +; CHECK-GI-NEXT: cset w10, ne +; CHECK-GI-NEXT: cmp w6, w7 +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: cset w11, eq +; CHECK-GI-NEXT: orr w9, w10, w11 +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret %9 = icmp ult i32 %0, %1 %10 = icmp ugt i32 %2, %3 %11 = select i1 %9, i1 true, i1 %10 @@ -189,22 +189,22 @@ define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 ; (x0 != 0) || (x1 != 0) define i32 @true_or2(i32 %0, i32 %1) { -; SDISEL-LABEL: true_or2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w0, w1 -; SDISEL-NEXT: cmp w8, #0 -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: true_or2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w0, w1 +; CHECK-SD-NEXT: cmp w8, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: true_or2: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w0, w8, #0x1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: true_or2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w1, #0 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret %3 = icmp ne i32 %0, 0 %4 = icmp ne i32 %1, 0 %5 = select i1 %3, i1 true, i1 %4 @@ -214,26 +214,26 @@ define i32 @true_or2(i32 %0, i32 %1) { ; (x0 != 0) || (x1 != 0) || (x2 != 0) define i32 @true_or3(i32 %0, i32 %1, i32 %2) { -; SDISEL-LABEL: true_or3: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w0, w1 -; SDISEL-NEXT: orr w8, w8, w2 -; SDISEL-NEXT: cmp w8, #0 -; SDISEL-NEXT: cset w0, ne -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: true_or3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w0, w1 +; CHECK-SD-NEXT: orr w8, w8, w2 +; CHECK-SD-NEXT: cmp w8, #0 +; CHECK-SD-NEXT: cset w0, ne +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: true_or3: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #0 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w0, w8, #0x1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: true_or3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: cmp w1, #0 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: cmp w2, #0 +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: cset w9, ne +; CHECK-GI-NEXT: orr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret %4 = icmp ne i32 %0, 0 %5 = icmp ne i32 %1, 0 %6 = select i1 %4, i1 true, i1 %5 @@ -260,22 +260,22 @@ define i32 @neg_range_int(i32 %a, i32 %b, i32 %c) { ; (b > -(d | 1) && a < c) define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #4, lt -; SDISEL-NEXT: csel w0, w1, w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #4, lt +; CHECK-SD-NEXT: csel w0, w1, w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #4, lt -; GISEL-NEXT: csel w0, w1, w0, gt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #4, lt +; CHECK-GI-NEXT: csel w0, w1, w0, gt +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp sgt i32 %b, %negd @@ -287,22 +287,22 @@ define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b >u -(d | 1) && a < c) define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp_u: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #0, lt -; SDISEL-NEXT: csel w0, w1, w0, hi -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp_u: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #0, lt +; CHECK-SD-NEXT: csel w0, w1, w0, hi +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp_u: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #0, lt -; GISEL-NEXT: csel w0, w1, w0, hi -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp_u: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #0, lt +; CHECK-GI-NEXT: csel w0, w1, w0, hi +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp ugt i32 %b, %negd @@ -314,22 +314,22 @@ define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b > -(d | 1) && a u < c) define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp_ua: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #4, lo -; SDISEL-NEXT: csel w0, w1, w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp_ua: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #4, lo +; CHECK-SD-NEXT: csel w0, w1, w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp_ua: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #4, lo -; GISEL-NEXT: csel w0, w1, w0, gt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp_ua: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #4, lo +; CHECK-GI-NEXT: csel w0, w1, w0, gt +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp sgt i32 %b, %negd @@ -341,19 +341,19 @@ define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b <= -3 && a > c) define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) { -; SDISEL-LABEL: neg_range_int_2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, #4, #4, gt -; SDISEL-NEXT: csel w0, w1, w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, #4, #4, gt +; CHECK-SD-NEXT: csel w0, w1, w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_2: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: ccmn w1, #3, #8, gt -; GISEL-NEXT: csel w0, w1, w0, ge -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: ccmn w1, #3, #8, gt +; CHECK-GI-NEXT: csel w0, w1, w0, ge +; CHECK-GI-NEXT: ret %cmp = icmp sge i32 %b, -3 %cmp1 = icmp sgt i32 %a, %c %or.cond = and i1 %cmp, %cmp1 @@ -363,22 +363,22 @@ define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) { ; (b < -(d | 1) && a >= c) define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #0, ge -; SDISEL-NEXT: csel w0, w1, w0, lt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #0, ge +; CHECK-SD-NEXT: csel w0, w1, w0, lt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp2: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #0, ge -; GISEL-NEXT: csel w0, w1, w0, lt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #0, ge +; CHECK-GI-NEXT: csel w0, w1, w0, lt +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp slt i32 %b, %negd @@ -390,22 +390,22 @@ define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b <u -(d | 1) && a > c) define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp_u2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #2, gt -; SDISEL-NEXT: csel w0, w1, w0, lo -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp_u2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #2, gt +; CHECK-SD-NEXT: csel w0, w1, w0, lo +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp_u2: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #2, gt -; GISEL-NEXT: csel w0, w1, w0, lo -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp_u2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #2, gt +; CHECK-GI-NEXT: csel w0, w1, w0, lo +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp ult i32 %b, %negd @@ -417,22 +417,22 @@ define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b > -(d | 1) && a u > c) define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp_ua2: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #4, hi -; SDISEL-NEXT: csel w0, w1, w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp_ua2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #4, hi +; CHECK-SD-NEXT: csel w0, w1, w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp_ua2: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #4, hi -; GISEL-NEXT: csel w0, w1, w0, gt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp_ua2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #4, hi +; CHECK-GI-NEXT: csel w0, w1, w0, gt +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp sgt i32 %b, %negd @@ -444,22 +444,22 @@ define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) { ; (b > -(d | 1) && a u == c) define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) { -; SDISEL-LABEL: neg_range_int_comp_ua3: -; SDISEL: // %bb.0: -; SDISEL-NEXT: orr w8, w3, #0x1 -; SDISEL-NEXT: cmp w0, w2 -; SDISEL-NEXT: ccmn w1, w8, #4, eq -; SDISEL-NEXT: csel w0, w1, w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_comp_ua3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr w8, w3, #0x1 +; CHECK-SD-NEXT: cmp w0, w2 +; CHECK-SD-NEXT: ccmn w1, w8, #4, eq +; CHECK-SD-NEXT: csel w0, w1, w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_comp_ua3: -; GISEL: // %bb.0: -; GISEL-NEXT: orr w8, w3, #0x1 -; GISEL-NEXT: cmp w0, w2 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: ccmp w1, w8, #4, eq -; GISEL-NEXT: csel w0, w1, w0, gt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_comp_ua3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr w8, w3, #0x1 +; CHECK-GI-NEXT: cmp w0, w2 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: ccmp w1, w8, #4, eq +; CHECK-GI-NEXT: csel w0, w1, w0, gt +; CHECK-GI-NEXT: ret %dor = or i32 %d, 1 %negd = sub i32 0, %dor %cmp = icmp sgt i32 %b, %negd @@ -471,26 +471,26 @@ define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) { ; -(a | 1) > (b | 3) && a < c define i32 @neg_range_int_c(i32 %a, i32 %b, i32 %c) { -; SDISEL-LABEL: neg_range_int_c: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: orr w8, w0, #0x1 -; SDISEL-NEXT: orr w9, w1, #0x3 -; SDISEL-NEXT: cmn w9, w8 -; SDISEL-NEXT: ccmp w2, w0, #2, lo -; SDISEL-NEXT: cset w0, lo -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: neg_range_int_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr w8, w0, #0x1 +; CHECK-SD-NEXT: orr w9, w1, #0x3 +; CHECK-SD-NEXT: cmn w9, w8 +; CHECK-SD-NEXT: ccmp w2, w0, #2, lo +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: neg_range_int_c: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: orr w8, w0, #0x1 -; GISEL-NEXT: orr w9, w1, #0x3 -; GISEL-NEXT: neg w8, w8 -; GISEL-NEXT: cmp w9, w8 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: cmp w2, w0 -; GISEL-NEXT: cset w9, lo -; GISEL-NEXT: and w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: neg_range_int_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr w8, w0, #0x1 +; CHECK-GI-NEXT: orr w9, w1, #0x3 +; CHECK-GI-NEXT: neg w8, w8 +; CHECK-GI-NEXT: cmp w9, w8 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: cmp w2, w0 +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %or = or i32 %a, 1 %sub = sub i32 0, %or diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll index 56208f1..02b0077 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll @@ -1,26 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple arm64-none-eabi -o - %s | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc -mtriple arm64-none-eabi -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple arm64-none-eabi -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple arm64-none-eabi -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI @out = internal global i32 0, align 4 ; Ensure that we transform select(C0, x, select(C1, x, y)) towards ; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation. define i32 @test0(i32 %v0, i32 %v1, i32 %v2) { -; SDISEL-LABEL: test0: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, #7 -; SDISEL-NEXT: ccmp w1, #0, #0, ne -; SDISEL-NEXT: csel w0, w1, w2, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, #7 +; CHECK-SD-NEXT: ccmp w1, #0, #0, ne +; CHECK-SD-NEXT: csel w0, w1, w2, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test0: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, #7 -; GISEL-NEXT: csel w8, w1, w2, eq -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: csel w0, w1, w8, gt -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, #7 +; CHECK-GI-NEXT: csel w8, w1, w2, eq +; CHECK-GI-NEXT: cmp w1, #0 +; CHECK-GI-NEXT: csel w0, w1, w8, gt +; CHECK-GI-NEXT: ret %cmp1 = icmp eq i32 %v0, 7 %cmp2 = icmp sgt i32 %v1, 0 %sel0 = select i1 %cmp1, i32 %v1, i32 %v2 @@ -32,36 +32,36 @@ define i32 @test0(i32 %v0, i32 %v1, i32 %v2) { ; sequences. This case should be transformed to select(C0, select(C1, x, y), y) ; anyway to get CSE effects. define void @test1(i32 %bitset, i32 %val0, i32 %val1) { -; SDISEL-LABEL: test1: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, #7 -; SDISEL-NEXT: adrp x9, out -; SDISEL-NEXT: csel w8, w1, w2, eq -; SDISEL-NEXT: cmp w8, #13 -; SDISEL-NEXT: csel w8, w1, w2, lo -; SDISEL-NEXT: cmp w0, #42 -; SDISEL-NEXT: csel w10, w1, w8, eq -; SDISEL-NEXT: str w8, [x9, :lo12:out] -; SDISEL-NEXT: str w10, [x9, :lo12:out] -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, #7 +; CHECK-SD-NEXT: adrp x9, out +; CHECK-SD-NEXT: csel w8, w1, w2, eq +; CHECK-SD-NEXT: cmp w8, #13 +; CHECK-SD-NEXT: csel w8, w1, w2, lo +; CHECK-SD-NEXT: cmp w0, #42 +; CHECK-SD-NEXT: csel w10, w1, w8, eq +; CHECK-SD-NEXT: str w8, [x9, :lo12:out] +; CHECK-SD-NEXT: str w10, [x9, :lo12:out] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test1: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, #7 -; GISEL-NEXT: csel w8, w1, w2, eq -; GISEL-NEXT: cmp w8, #13 -; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w9, w1, w2, ne -; GISEL-NEXT: cmp w0, #42 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: orr w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: adrp x8, out -; GISEL-NEXT: csel w10, w1, w2, ne -; GISEL-NEXT: str w9, [x8, :lo12:out] -; GISEL-NEXT: str w10, [x8, :lo12:out] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, #7 +; CHECK-GI-NEXT: csel w8, w1, w2, eq +; CHECK-GI-NEXT: cmp w8, #13 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: csel w9, w1, w2, ne +; CHECK-GI-NEXT: cmp w0, #42 +; CHECK-GI-NEXT: cset w10, eq +; CHECK-GI-NEXT: orr w8, w10, w8 +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: adrp x8, out +; CHECK-GI-NEXT: csel w10, w1, w2, ne +; CHECK-GI-NEXT: str w9, [x8, :lo12:out] +; CHECK-GI-NEXT: str w10, [x8, :lo12:out] +; CHECK-GI-NEXT: ret %cmp1 = icmp eq i32 %bitset, 7 %cond = select i1 %cmp1, i32 %val0, i32 %val1 %cmp5 = icmp ult i32 %cond, 13 diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll index 1b98954..b056460 100644 --- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll +++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half) declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half) @@ -27,18 +27,18 @@ declare half @llvm.aarch64.neon.frecpx.f16(half) declare half @llvm.aarch64.neon.frecpe.f16(half) define dso_local i16 @t2(half %a) { -; SDISEL-LABEL: t2: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, #0.0 -; SDISEL-NEXT: csetm w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, #0.0 +; CHECK-SD-NEXT: csetm w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t2: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, #0.0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, #0.0 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp oeq half %a, 0xH0000 %vceqz = sext i1 %0 to i16 @@ -46,18 +46,18 @@ entry: } define dso_local i16 @t3(half %a) { -; SDISEL-LABEL: t3: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, #0.0 -; SDISEL-NEXT: csetm w0, ge -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t3: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, #0.0 +; CHECK-SD-NEXT: csetm w0, ge +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t3: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, #0.0 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t3: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, #0.0 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp oge half %a, 0xH0000 %vcgez = sext i1 %0 to i16 @@ -65,18 +65,18 @@ entry: } define dso_local i16 @t4(half %a) { -; SDISEL-LABEL: t4: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, #0.0 -; SDISEL-NEXT: csetm w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t4: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, #0.0 +; CHECK-SD-NEXT: csetm w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t4: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, #0.0 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t4: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, #0.0 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp ogt half %a, 0xH0000 %vcgtz = sext i1 %0 to i16 @@ -84,18 +84,18 @@ entry: } define dso_local i16 @t5(half %a) { -; SDISEL-LABEL: t5: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, #0.0 -; SDISEL-NEXT: csetm w0, ls -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t5: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, #0.0 +; CHECK-SD-NEXT: csetm w0, ls +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t5: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, #0.0 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t5: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, #0.0 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp ole half %a, 0xH0000 %vclez = sext i1 %0 to i16 @@ -103,18 +103,18 @@ entry: } define dso_local i16 @t6(half %a) { -; SDISEL-LABEL: t6: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, #0.0 -; SDISEL-NEXT: csetm w0, mi -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t6: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, #0.0 +; CHECK-SD-NEXT: csetm w0, mi +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t6: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, #0.0 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t6: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, #0.0 +; CHECK-GI-NEXT: cset w8, mi +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp olt half %a, 0xH0000 %vcltz = sext i1 %0 to i16 @@ -172,15 +172,15 @@ entry: } define dso_local i16 @t16(half %a) { -; SDISEL-LABEL: t16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcvtzs w0, h0 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtzs w0, h0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcvtzu w0, h0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcvtzu w0, h0 +; CHECK-GI-NEXT: ret entry: %0 = fptoui half %a to i16 ret i16 %0 diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll index 5b08ef2..da70599 100644 --- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll +++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,SDISEL -; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare half @llvm.aarch64.sisd.fabd.f16(half, half) @@ -35,18 +35,18 @@ entry: } define dso_local i16 @t_vceqh_f16(half %a, half %b) { -; SDISEL-LABEL: t_vceqh_f16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, h1 -; SDISEL-NEXT: csetm w0, eq -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t_vceqh_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, h1 +; CHECK-SD-NEXT: csetm w0, eq +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t_vceqh_f16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, h1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t_vceqh_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, h1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp oeq half %a, %b %vcmpd = sext i1 %0 to i16 @@ -54,18 +54,18 @@ entry: } define dso_local i16 @t_vcgeh_f16(half %a, half %b) { -; SDISEL-LABEL: t_vcgeh_f16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, h1 -; SDISEL-NEXT: csetm w0, ge -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t_vcgeh_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, h1 +; CHECK-SD-NEXT: csetm w0, ge +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t_vcgeh_f16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, h1 -; GISEL-NEXT: cset w8, ge -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t_vcgeh_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, h1 +; CHECK-GI-NEXT: cset w8, ge +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp oge half %a, %b %vcmpd = sext i1 %0 to i16 @@ -73,18 +73,18 @@ entry: } define dso_local i16 @t_vcgth_f16(half %a, half %b) { -; SDISEL-LABEL: t_vcgth_f16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, h1 -; SDISEL-NEXT: csetm w0, gt -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t_vcgth_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, h1 +; CHECK-SD-NEXT: csetm w0, gt +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t_vcgth_f16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, h1 -; GISEL-NEXT: cset w8, gt -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t_vcgth_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, h1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp ogt half %a, %b %vcmpd = sext i1 %0 to i16 @@ -92,18 +92,18 @@ entry: } define dso_local i16 @t_vcleh_f16(half %a, half %b) { -; SDISEL-LABEL: t_vcleh_f16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, h1 -; SDISEL-NEXT: csetm w0, ls -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t_vcleh_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, h1 +; CHECK-SD-NEXT: csetm w0, ls +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t_vcleh_f16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, h1 -; GISEL-NEXT: cset w8, ls -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t_vcleh_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, h1 +; CHECK-GI-NEXT: cset w8, ls +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp ole half %a, %b %vcmpd = sext i1 %0 to i16 @@ -111,18 +111,18 @@ entry: } define dso_local i16 @t_vclth_f16(half %a, half %b) { -; SDISEL-LABEL: t_vclth_f16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fcmp h0, h1 -; SDISEL-NEXT: csetm w0, mi -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: t_vclth_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp h0, h1 +; CHECK-SD-NEXT: csetm w0, mi +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: t_vclth_f16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: fcmp h0, h1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: sbfx w0, w8, #0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: t_vclth_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp h0, h1 +; CHECK-GI-NEXT: cset w8, mi +; CHECK-GI-NEXT: sbfx w0, w8, #0, #1 +; CHECK-GI-NEXT: ret entry: %0 = fcmp olt half %a, %b %vcmpd = sext i1 %0 to i16 @@ -187,18 +187,18 @@ declare half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32, i32) #1 declare i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half, i32) #1 define dso_local half @test_vcvth_n_f16_s16_1(i16 %a) { -; SDISEL-LABEL: test_vcvth_n_f16_s16_1: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fmov s0, w0 -; SDISEL-NEXT: scvtf h0, h0, #1 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test_vcvth_n_f16_s16_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: scvtf h0, h0, #1 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_vcvth_n_f16_s16_1: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: sxth w8, w0 -; GISEL-NEXT: fmov s0, w8 -; GISEL-NEXT: scvtf h0, h0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvth_n_f16_s16_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: scvtf h0, h0, #1 +; CHECK-GI-NEXT: ret entry: %sext = sext i16 %a to i32 %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 1) @@ -206,18 +206,18 @@ entry: } define dso_local half @test_vcvth_n_f16_s16_16(i16 %a) { -; SDISEL-LABEL: test_vcvth_n_f16_s16_16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fmov s0, w0 -; SDISEL-NEXT: scvtf h0, h0, #16 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test_vcvth_n_f16_s16_16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: scvtf h0, h0, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_vcvth_n_f16_s16_16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: sxth w8, w0 -; GISEL-NEXT: fmov s0, w8 -; GISEL-NEXT: scvtf h0, h0, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvth_n_f16_s16_16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: scvtf h0, h0, #16 +; CHECK-GI-NEXT: ret entry: %sext = sext i16 %a to i32 %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 16) @@ -315,18 +315,18 @@ entry: } define dso_local half @test_vcvth_n_f16_u16_1(i16 %a) { -; SDISEL-LABEL: test_vcvth_n_f16_u16_1: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fmov s0, w0 -; SDISEL-NEXT: ucvtf h0, h0, #1 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test_vcvth_n_f16_u16_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ucvtf h0, h0, #1 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_vcvth_n_f16_u16_1: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: and w8, w0, #0xffff -; GISEL-NEXT: fmov s0, w8 -; GISEL-NEXT: ucvtf h0, h0, #1 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvth_n_f16_u16_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: ucvtf h0, h0, #1 +; CHECK-GI-NEXT: ret entry: %0 = zext i16 %a to i32 %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 1) @@ -334,18 +334,18 @@ entry: } define dso_local half @test_vcvth_n_f16_u16_16(i16 %a) { -; SDISEL-LABEL: test_vcvth_n_f16_u16_16: -; SDISEL: // %bb.0: // %entry -; SDISEL-NEXT: fmov s0, w0 -; SDISEL-NEXT: ucvtf h0, h0, #16 -; SDISEL-NEXT: ret +; CHECK-SD-LABEL: test_vcvth_n_f16_u16_16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ucvtf h0, h0, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_vcvth_n_f16_u16_16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: and w8, w0, #0xffff -; GISEL-NEXT: fmov s0, w8 -; GISEL-NEXT: ucvtf h0, h0, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvth_n_f16_u16_16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: ucvtf h0, h0, #16 +; CHECK-GI-NEXT: ret entry: %0 = zext i16 %a to i32 %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 16) diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll new file mode 100644 index 0000000..c4a027c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s + +target triple = "arm64-apple-macosx13.0.0" + +@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)] + +define void @test_interp(ptr %frame, ptr %dst) { +; CHECK-LABEL: test_interp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_offset w19, -24 +; CHECK-NEXT: .cfi_offset w20, -32 +; CHECK-NEXT: .cfi_offset w21, -40 +; CHECK-NEXT: .cfi_offset w22, -48 +; CHECK-NEXT: .cfi_offset w23, -56 +; CHECK-NEXT: .cfi_offset w24, -64 +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x21, _opcode.targets@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: add x8, x21, xzr, lsl #3 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x23, x22, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp0: ; Block address taken +; CHECK-NEXT: LBB0_1: ; %loop.header +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: mov x20, xzr +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp1: ; Block address taken +; CHECK-NEXT: LBB0_2: ; %op1.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str xzr, [x19] +; CHECK-NEXT: mov w8, #1 ; =0x1 +; CHECK-NEXT: ldr x0, [x20, #-8]! +; CHECK-NEXT: ldr x9, [x0, #8] +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ldr x8, [x9, #48] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp2: ; Block address taken +; CHECK-NEXT: LBB0_3: ; %op2.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: mov x20, xzr +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp3: ; Block address taken +; CHECK-NEXT: LBB0_4: ; %op4.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: add x10, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: ldur x8, [x22, #12] +; CHECK-NEXT: ldur x9, [x20, #-8] +; CHECK-NEXT: add x22, x22, #20 +; CHECK-NEXT: stp x8, x9, [x20, #-8] +; CHECK-NEXT: add x20, x20, #8 +; CHECK-NEXT: br x10 +; CHECK-NEXT: Ltmp4: ; Block address taken +; CHECK-NEXT: LBB0_5: ; %op5.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: add x10, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: ldur x8, [x22, #12] +; CHECK-NEXT: ldur x9, [x20, #-8] +; CHECK-NEXT: add x22, x22, #20 +; CHECK-NEXT: stp x8, x9, [x20, #-8] +; CHECK-NEXT: add x20, x20, #8 +; CHECK-NEXT: br x10 +; CHECK-NEXT: Ltmp5: ; Block address taken +; CHECK-NEXT: LBB0_6: ; %op6.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x0, [x20, #-8]! +; CHECK-NEXT: mov w8, #1 ; =0x1 +; CHECK-NEXT: ldr x9, [x0, #8] +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ldr x8, [x9, #48] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1 +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ] + %stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ] + %next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ] + %iv.next = add i64 %iv, 1 + %next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv + indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb] + +op1.bb: + store ptr null, ptr %dst, align 8 + %stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.0 = load ptr, ptr %stack.8, align 8 + store i64 1, ptr %l.0, align 8 + %gep.0 = getelementptr i8, ptr %l.0, i64 8 + %l.1 = load ptr, ptr %gep.0, align 8 + %gep.1 = getelementptr i8, ptr %l.1, i64 48 + %l.2 = load ptr, ptr %gep.1, align 8 + tail call void %l.2(ptr nonnull %l.0) + br label %loop.header + +op2.bb: + store ptr %next.instr, ptr %dst, align 8 + br label %loop.header + +op4.bb: + store ptr %next.instr, ptr %dst, align 8 + %next.instr.20 = getelementptr i8, ptr %next.instr, i64 20 + %stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.3 = load ptr, ptr %stack.2, align 8 + %next.instr.12 = getelementptr i8, ptr %next.instr, i64 12 + %next.instr.12.val = load ptr, ptr %next.instr.12, align 2 + store ptr %next.instr.12.val, ptr %stack.2, align 8 + store ptr %l.3, ptr %stack.pointer, align 8 + %stack.next = getelementptr i8, ptr %stack.pointer, i64 8 + br label %loop.header + +op5.bb: + store ptr %next.instr, ptr %dst, align 8 + %next.instr.21 = getelementptr i8, ptr %next.instr, i64 20 + %stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.4 = load ptr, ptr %stack.3, align 8 + %next.instr.2 = getelementptr i8, ptr %next.instr, i64 12 + %next.instr.2.val = load ptr, ptr %next.instr.2, align 2 + store ptr %next.instr.2.val, ptr %stack.3, align 8 + store ptr %l.4, ptr %stack.pointer, align 8 + %stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8 + br label %loop.header + +op6.bb: + %stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.5 = load ptr, ptr %stack.4, align 8 + store i64 1, ptr %l.5, align 8 + %gep.5 = getelementptr i8, ptr %l.5, i64 8 + %l.6 = load ptr, ptr %gep.5, align 8 + %gep.6 = getelementptr i8, ptr %l.6, i64 48 + %l.7 = load ptr, ptr %gep.6, align 8 + tail call void %l.7(ptr nonnull %l.5) + br label %loop.header +} diff --git a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll index 05f4fb1..a6cb712 100644 --- a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll @@ -40,3 +40,10 @@ define void @test_optsize() optsize { ; CHECK-LABEL: test_optsize ; CHECK-NEXT: .p2align 2 + +define void @test_minsize() minsize { + ret void +} + +; CHECK-LABEL: test_minsize +; CHECK-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll new file mode 100644 index 0000000..d1e82a0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s + +; Test that stores that may hit scratch are correctly promoted to SCOPE_SE. + +define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) { +; GCN-LABEL: test_scratch_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr addrspace(5) %ptr + ret void +} + +define void @test_unknown_flat_store(ptr %ptr, i32 %val) { +; GCN-LABEL: test_unknown_flat_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr + ret void +} + +define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 { +; GCN-LABEL: test_flat_store_no_scratch_alloc: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr + ret void +} + +; TODO: handle +define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) { +; GCN-LABEL: test_flat_store_noalias_addrspace: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6} + ret void +} + +; TODO: would be nice to handle too +define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) { +; GCN-SDAG-LABEL: test_flat_store_select: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 +; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE +; GCN-SDAG-NEXT: s_wait_dscnt 0x0 +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_flat_store_select: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 +; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE +; GCN-GISEL-NEXT: s_wait_dscnt 0x0 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %a.ascast = addrspacecast ptr addrspace(1) %a to ptr + %b.ascast = addrspacecast ptr addrspace(3) %b to ptr + %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast + store i32 %val, ptr %ptr + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index fd644a3..3a898a9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0xd -; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 -; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 -; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 -; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 -; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 -; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 -; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 -; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 -; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 -; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 -; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 -; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 -; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 -; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE ; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224 ; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-SDAG-NEXT: s_clause 0xd ; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192 ; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208 @@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0xf -; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 -; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 -; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 -; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 -; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 -; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 -; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 -; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 -; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 -; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 -; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 -; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 -; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 -; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 -; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 -; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE ; GCN-GISEL-NEXT: s_wait_xcnt 0x8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4 ; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-GISEL-NEXT: s_clause 0xe ; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48 ; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64 @@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16 ; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0xe @@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x3 -; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 -; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 -; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 -; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE ; GCN-SDAG-NEXT: s_clause 0x7 ; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112 ; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96 @@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x5 -; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 -; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 -; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 -; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 -; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 -; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE ; GCN-GISEL-NEXT: s_clause 0x7 ; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80 ; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 7d85d34..beda16c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1,13 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index d358837..8081a15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) @@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 21465be..d81ec1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1687,7 +1653,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -1701,41 +1667,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -2051,7 +1995,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -2065,41 +2009,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2781,24 +2677,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2827,24 +2721,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2852,24 +2746,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -2930,24 +2824,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2976,24 +2868,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -3001,24 +2893,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -4246,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4318,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4371,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4442,17 +4312,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4463,42 +4333,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: @@ -4645,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4717,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4770,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4841,17 +4689,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4862,42 +4710,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: @@ -5045,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5088,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5124,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5279,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5322,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5358,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5643,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5664,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5747,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5768,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5845,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 37809da..f78ea92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v20, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1964,40 +1962,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2031,40 +2025,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2096,34 +2086,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2162,34 +2148,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index bc50058..0b2818f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: s_movk_i32 s2, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -5031,77 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5109,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a22, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5177,77 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5255,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -6298,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index fb1e46d..31a48de 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -1,13 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -16,6 +57,81 @@ bb: } define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { +; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_nop 7 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 7 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3) @@ -25,6 +141,4 @@ bb: attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} ; GFX942: {{.*}} -; GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 8056881..b25fe83 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v13, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" } diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll index 84123e6..393581f 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll @@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc25084..5e5e3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { @@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index c3164b8..f54a383 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s ;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s ;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s ; Partial reg copy and spill missed during regalloc handled later at frame lowering. @@ -12,17 +12,21 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8 - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy @@ -57,15 +61,17 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8 - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %24 + ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %24 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %22 + ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %22 + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; REGALLOC-GFX90A-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 053038d..382d892 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -1,14 +1,116 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -; FUNC-LABEL: {{^}}ssubo_i64_zext: define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { +; SI-LABEL: ssubo_i64_zext: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_sub_u32 s10, s2, s8 +; SI-NEXT: s_subb_u32 s11, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] +; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ssubo_i64_zext: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_sub_u32 s6, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_subb_u32 s7, s3, s5 +; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: ssubo_i64_zext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ssubo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s2, s6 +; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ssubo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s6, s2, s4 +; GFX11-NEXT: s_subb_u32 s7, s3, s5 +; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] +; GFX11-NEXT: s_xor_b32 s2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ret void } -; FUNC-LABEL: {{^}}s_ssubo_i32: define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { +; SI-LABEL: s_ssubo_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_sub_i32 s12, s8, s9 +; SI-NEXT: s_cmp_gt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9] +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ssubo_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_i32 s6, s4, s5 +; VI-NEXT: s_cmp_gt_i32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lt_i32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_ssubo_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s4, s6, s7 +; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_ssubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp +; GFX10-NEXT: s_sub_i32 s4, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_ssubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp +; GFX11-NEXT: s_sub_i32 s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-NEXT: s_endpgm %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %ssub, 0 %carry = extractvalue { i32, i1 } %ssub, 1 @@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_i32: define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: flat_store_dword v[0:1], v6 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp +; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 %b = load i32, ptr addrspace(1) %bptr, align 4 %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}s_ssubo_i64: -; GCN: s_sub_u32 -; GCN: s_subb_u32 define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind { +; SI-LABEL: s_ssubo_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_u32 s12, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_subb_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ssubo_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_ssubo_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_byte v2, v0, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_ssubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s0, s12, s14 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_xor_b32 s0, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_ssubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s8, s4, s6 +; GFX11-NEXT: s_subb_u32 s9, s5, s7 +; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_i64: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc +; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[6:7], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_byte v6, v0, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX10-NEXT: global_store_byte v6, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] +; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] +; GFX11-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %aptr, align 4 %b = load i64, ptr addrspace(1) %bptr, align 4 %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_v2i32: -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_sub_{{[iu]}}32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_sub_{{[iu]}}32 define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3 +; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 +; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3 +; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp +; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2 +; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3 +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3 +; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2 +; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index d230ff5..e1574dc 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: @@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_add_u32 s0, s2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_addc_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s4, s2, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s4, s2, s4 +; GFX11-NEXT: s_addc_u32 s5, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i32: @@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i32_novcc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i32_novcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s6, s4, s6 -; SI-NEXT: s_addc_u32 s7, s5, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_addc_u32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i64: @@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s12, s14 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s6, s4, s6 +; GFX11-NEXT: s_addc_u32 s7, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX10-NEXT: global_store_byte v4, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b8 v4, v0, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr @@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_short v0, v2, s[8:9] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2 +; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: global_store_short v0, v2, s[8:9] +; GFX10-NEXT: global_store_byte v0, v1, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5] +; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr @@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind @@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_clamp_bit: @@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_dword v1, v0, s[8:9] ; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3 +; GFX10-NEXT: s_cmp_eq_u32 s2, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB8_2: ; %exit +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[8:9] +; GFX10-NEXT: global_store_byte v1, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s0, s1, -1 +; GFX11-NEXT: .LBB8_2: ; %exit +; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_store_b8 v1, v2, s[6:7] +; GFX11-NEXT: s_endpgm entry: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 @@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB9_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s4, s5, -1 +; GFX11-NEXT: .LBB9_2: ; %exit +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -813,23 +1161,23 @@ exit: define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) { ; SI-LABEL: sv_uaddo_i128: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; SI-NEXT: v_mov_b32_e32 v6, s1 -; SI-NEXT: v_mov_b32_e32 v7, s2 -; SI-NEXT: v_mov_b32_e32 v8, s3 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3] -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc +; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; SI-NEXT: v_mov_b32_e32 v6, s3 +; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5] +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] +; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_and_b32_e32 v2, 1, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sv_uaddo_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sv_uaddo_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b) %carry = extractvalue { i128, i1 } %uadd, 1 %carry.ext = zext i1 %carry to i32 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 7d7f1b4..0289dab 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s - +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: @@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_sub_u32 s0, s2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_subb_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s2, s6 +; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s4, s2, s4 +; GFX11-NEXT: s_subb_u32 s5, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i32: @@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i32_novcc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i32_novcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s6, s4, s6 -; SI-NEXT: s_subb_u32 s7, s5, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_subb_u32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i64: @@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s0, s12, s14 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s6, s4, s6 +; GFX11-NEXT: s_subb_u32 s7, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX10-NEXT: global_store_byte v4, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b8 v4, v0, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr @@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_short v0, v2, s[8:9] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2 +; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: global_store_short v0, v2, s[8:9] +; GFX10-NEXT: global_store_byte v0, v1, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5] +; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr @@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind @@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_clamp_bit: @@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_dword v1, v0, s[8:9] ; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3 +; GFX10-NEXT: s_cmp_eq_u32 s2, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB8_2: ; %exit +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[8:9] +; GFX10-NEXT: global_store_byte v1, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s0, s1, -1 +; GFX11-NEXT: .LBB8_2: ; %exit +; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_store_b8 v1, v2, s[6:7] +; GFX11-NEXT: s_endpgm entry: %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 @@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB9_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s4, s5, -1 +; GFX11-NEXT: .LBB9_2: ; %exit +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll index 145f1e4..ff18b32 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll @@ -2,7 +2,7 @@ ; A negative test to capture the expected error when the VGPRs are insufficient for wwm-regalloc. -; CHECK: error: can't find enough VGPRs for wwm-regalloc +; CHECK: error: cannot find enough VGPRs for wwm-regalloc define amdgpu_kernel void @test(i32 %in) { entry: diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll index f3a227c..2fc6790 100644 --- a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll @@ -22,3 +22,11 @@ define void @test() { define void @test_optsize() optsize { ret void } + +; CHECK-LABEL: test_minsize +; ALIGN-CS-16: .p2align 1 +; ALIGN-CS-32: .p2align 2 + +define void @test_minsize() minsize { + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll index 16cc1f3..e5a6aa4 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll @@ -183,7 +183,7 @@ b0: %v11 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v10, <32 x i32> undef) %v12 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v11, i32 2147483647, i32 1) store <64 x i32> %v12, ptr @g0, align 128 - call void (ptr, ...) @f1(ptr @g3) #2 + call void (ptr, ...) @f1(ptr @g3) #3 %v13 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2) %v14 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v13) %v15 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v14, i32 -2147483648, i32 1) @@ -193,7 +193,7 @@ b0: %v17 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v16) %v18 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v17, i32 0, i32 1) store <64 x i32> %v18, ptr @g0, align 128 - call void @f0() #2 + call void @f0() #3 %v19 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1) %v20 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2) %v21 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v19, <32 x i32> %v20) @@ -205,3 +205,4 @@ b0: attributes #0 = { nounwind "use-soft-float"="false" "target-cpu"="hexagonv66" "target-features"="+hvxv66,+hvx-length128b" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind optsize } +attributes #3 = { nounwind minsize } diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index 5130865..c18c637 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -436,49 +436,47 @@ entry: define void @buildvector_v32i8_with_constant(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5, i8 %a8, i8 %a9, i8 %a15, i8 %a17, i8 %a18, i8 %a20, i8 %a22, i8 %a23, i8 %a27, i8 %a28, i8 %a31) nounwind { ; CHECK-LABEL: buildvector_v32i8_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: ld.b $t0, $fp, 0 -; CHECK-NEXT: ld.b $t1, $fp, 8 -; CHECK-NEXT: ld.b $t2, $fp, 16 -; CHECK-NEXT: ld.b $t3, $fp, 24 -; CHECK-NEXT: ld.b $t4, $fp, 56 -; CHECK-NEXT: ld.b $t5, $fp, 32 -; CHECK-NEXT: ld.b $t6, $fp, 48 -; CHECK-NEXT: ld.b $t7, $fp, 40 -; CHECK-NEXT: st.b $t4, $sp, 63 -; CHECK-NEXT: st.b $zero, $sp, 61 -; CHECK-NEXT: st.b $t6, $sp, 60 -; CHECK-NEXT: st.b $t7, $sp, 59 -; CHECK-NEXT: st.b $zero, $sp, 56 -; CHECK-NEXT: st.b $t5, $sp, 55 -; CHECK-NEXT: st.b $t3, $sp, 54 -; CHECK-NEXT: st.b $zero, $sp, 53 -; CHECK-NEXT: st.b $t2, $sp, 52 -; CHECK-NEXT: st.b $zero, $sp, 51 -; CHECK-NEXT: st.b $t1, $sp, 50 -; CHECK-NEXT: st.b $t0, $sp, 49 -; CHECK-NEXT: st.b $zero, $sp, 48 -; CHECK-NEXT: st.b $a7, $sp, 47 -; CHECK-NEXT: st.h $zero, $sp, 44 -; CHECK-NEXT: st.b $zero, $sp, 42 -; CHECK-NEXT: st.b $a6, $sp, 41 -; CHECK-NEXT: st.b $a5, $sp, 40 -; CHECK-NEXT: st.b $zero, $sp, 39 -; CHECK-NEXT: st.b $a4, $sp, 37 -; CHECK-NEXT: st.h $zero, $sp, 35 -; CHECK-NEXT: st.b $a3, $sp, 34 -; CHECK-NEXT: st.b $a2, $sp, 33 -; CHECK-NEXT: st.b $a1, $sp, 32 -; CHECK-NEXT: xvld $xr0, $sp, 32 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: ld.b $t0, $sp, 56 +; CHECK-NEXT: ld.b $t1, $sp, 48 +; CHECK-NEXT: ld.b $t2, $sp, 40 +; CHECK-NEXT: ld.b $t3, $sp, 32 +; CHECK-NEXT: ld.b $t4, $sp, 24 +; CHECK-NEXT: ld.b $t5, $sp, 16 +; CHECK-NEXT: ld.b $t6, $sp, 8 +; CHECK-NEXT: ld.b $t7, $sp, 0 +; CHECK-NEXT: xvrepli.b $xr0, 0 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 5 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 8 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 9 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t7, 1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t6, 2 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t5, 4 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t4, 6 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t3, 7 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t2, 11 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t1, 12 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $t0, 15 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0 @@ -624,32 +622,19 @@ entry: define void @buildvector_v16i16_with_constant(ptr %dst, i16 %a2, i16 %a3, i16 %a5, i16 %a6, i16 %a7, i16 %a12, i16 %a13) nounwind { ; CHECK-LABEL: buildvector_v16i16_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: st.h $a7, $sp, 58 -; CHECK-NEXT: st.h $a6, $sp, 56 -; CHECK-NEXT: st.h $a5, $sp, 46 -; CHECK-NEXT: st.h $a4, $sp, 44 -; CHECK-NEXT: st.h $a3, $sp, 42 -; CHECK-NEXT: ori $a3, $zero, 2 -; CHECK-NEXT: st.h $a3, $sp, 40 -; CHECK-NEXT: st.h $a2, $sp, 38 -; CHECK-NEXT: st.h $a1, $sp, 36 -; CHECK-NEXT: lu12i.w $a1, 32 -; CHECK-NEXT: ori $a1, $a1, 2 -; CHECK-NEXT: st.w $a1, $sp, 60 -; CHECK-NEXT: st.w $a1, $sp, 32 -; CHECK-NEXT: lu32i.d $a1, 131074 -; CHECK-NEXT: st.d $a1, $sp, 48 -; CHECK-NEXT: xvld $xr0, $sp, 32 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: xvrepli.h $xr0, 2 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 3 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 5 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 6 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 4 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 5 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <16 x i16> undef, i16 2, i32 0 @@ -724,24 +709,12 @@ entry: define void @buildvector_v8i32_with_constant(ptr %dst, i32 %a2, i32 %a4, i32 %a5, i32 %a6) nounwind { ; CHECK-LABEL: buildvector_v8i32_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: st.w $zero, $sp, 60 -; CHECK-NEXT: st.w $a4, $sp, 56 -; CHECK-NEXT: st.w $a3, $sp, 52 -; CHECK-NEXT: st.w $a2, $sp, 48 -; CHECK-NEXT: st.w $zero, $sp, 44 -; CHECK-NEXT: st.w $a1, $sp, 40 -; CHECK-NEXT: st.d $zero, $sp, 32 -; CHECK-NEXT: xvld $xr0, $sp, 32 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: xvrepli.b $xr0, 0 +; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 2 +; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 4 +; CHECK-NEXT: xvinsgr2vr.w $xr0, $a3, 5 +; CHECK-NEXT: xvinsgr2vr.w $xr0, $a4, 6 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <8 x i32> undef, i32 0, i32 0 @@ -793,21 +766,10 @@ entry: define void @buildvector_v4i64_with_constant(ptr %dst, i64 %a0, i64 %a2) nounwind { ; CHECK-LABEL: buildvector_v4i64_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: st.d $zero, $sp, 56 -; CHECK-NEXT: st.d $a2, $sp, 48 -; CHECK-NEXT: st.d $zero, $sp, 40 -; CHECK-NEXT: st.d $a1, $sp, 32 -; CHECK-NEXT: xvld $xr0, $sp, 32 +; CHECK-NEXT: xvrepli.b $xr0, 0 +; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0 +; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret entry: %ins0 = insertelement <4 x i64> undef, i64 %a0, i32 0 @@ -880,27 +842,17 @@ entry: define void @buildvector_v8f32_with_constant(ptr %dst, float %a1, float %a2, float %a5, float %a7) nounwind { ; CHECK-LABEL: buildvector_v8f32_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: fst.s $fa3, $sp, 60 -; CHECK-NEXT: fst.s $fa2, $sp, 52 -; CHECK-NEXT: fst.s $fa1, $sp, 40 -; CHECK-NEXT: fst.s $fa0, $sp, 36 -; CHECK-NEXT: vldi $vr0, -1280 -; CHECK-NEXT: fst.s $fa0, $sp, 56 +; CHECK-NEXT: # kill: def $f3 killed $f3 def $xr3 +; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2 +; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 ; CHECK-NEXT: lu12i.w $a1, 262144 -; CHECK-NEXT: lu52i.d $a1, $a1, 1024 -; CHECK-NEXT: st.d $a1, $sp, 44 -; CHECK-NEXT: fst.s $fa0, $sp, 32 -; CHECK-NEXT: xvld $xr0, $sp, 32 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: xvreplgr2vr.w $xr4, $a1 +; CHECK-NEXT: xvinsve0.w $xr4, $xr0, 1 +; CHECK-NEXT: xvinsve0.w $xr4, $xr1, 2 +; CHECK-NEXT: xvinsve0.w $xr4, $xr2, 5 +; CHECK-NEXT: xvinsve0.w $xr4, $xr3, 7 +; CHECK-NEXT: xvst $xr4, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <8 x float> undef, float 2.0, i32 0 @@ -956,21 +908,12 @@ entry: define void @buildvector_v4f64_with_constant(ptr %dst, double %a0, double %a3) nounwind { ; CHECK-LABEL: buildvector_v4f64_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 96 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: fst.d $fa1, $sp, 56 -; CHECK-NEXT: vrepli.b $vr1, 0 -; CHECK-NEXT: vst $vr1, $sp, 40 -; CHECK-NEXT: fst.d $fa0, $sp, 32 -; CHECK-NEXT: xvld $xr0, $sp, 32 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $fp, -96 -; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1 +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: xvrepli.b $xr2, 0 +; CHECK-NEXT: xvinsve0.d $xr2, $xr0, 0 +; CHECK-NEXT: xvinsve0.d $xr2, $xr1, 3 +; CHECK-NEXT: xvst $xr2, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <4 x double> undef, double %a0, i32 0 diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll index 78588c5..9517558 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll @@ -307,23 +307,15 @@ entry: define void @buildvector_v16i8_with_constant(ptr %dst, i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) nounwind { ; CHECK-LABEL: buildvector_v16i8_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: st.b $a7, $sp, 15 -; CHECK-NEXT: st.h $zero, $sp, 13 -; CHECK-NEXT: st.b $a6, $sp, 12 -; CHECK-NEXT: st.b $a5, $sp, 11 -; CHECK-NEXT: st.h $zero, $sp, 9 -; CHECK-NEXT: st.b $a4, $sp, 8 -; CHECK-NEXT: st.b $zero, $sp, 7 -; CHECK-NEXT: st.b $a3, $sp, 6 -; CHECK-NEXT: st.b $zero, $sp, 5 -; CHECK-NEXT: st.b $a2, $sp, 4 -; CHECK-NEXT: st.b $zero, $sp, 3 -; CHECK-NEXT: st.h $zero, $sp, 1 -; CHECK-NEXT: st.b $a1, $sp, 0 -; CHECK-NEXT: vld $vr0, $sp, 0 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 4 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 6 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 8 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 11 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 12 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0 @@ -398,16 +390,12 @@ entry: define void @buildvector_v8i16_with_constant(ptr %dst, i16 %a0, i16 %a3, i16 %a4, i16 %a5) nounwind { ; CHECK-LABEL: buildvector_v8i16_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: st.h $zero, $sp, 12 -; CHECK-NEXT: st.h $a4, $sp, 10 -; CHECK-NEXT: st.h $a3, $sp, 8 -; CHECK-NEXT: st.h $a2, $sp, 6 -; CHECK-NEXT: st.h $zero, $sp, 2 -; CHECK-NEXT: st.h $a1, $sp, 0 -; CHECK-NEXT: vld $vr0, $sp, 0 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 3 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 4 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 5 ; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret entry: %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0 @@ -459,15 +447,11 @@ entry: define void @buildvector_v4i32_with_constant(ptr %dst, i32 %a0, i32 %a2, i32 %a3) nounwind { ; CHECK-LABEL: buildvector_v4i32_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: st.w $a3, $sp, 12 -; CHECK-NEXT: st.w $a2, $sp, 8 -; CHECK-NEXT: ori $a2, $zero, 2 -; CHECK-NEXT: st.w $a2, $sp, 4 -; CHECK-NEXT: st.w $a1, $sp, 0 -; CHECK-NEXT: vld $vr0, $sp, 0 +; CHECK-NEXT: vrepli.w $vr0, 2 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 2 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a3, 3 ; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret entry: %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0 @@ -508,9 +492,8 @@ entry: define void @buildvector_v2i64_with_constant(ptr %dst, i64 %a1) nounwind { ; CHECK-LABEL: buildvector_v2i64_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vinsgr2vr.d $vr0, $zero, 0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a1, 0 -; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -561,15 +544,14 @@ entry: define void @buildvector_v4f32_with_constant(ptr %dst, float %a1, float %a2, float %a3) nounwind { ; CHECK-LABEL: buildvector_v4f32_with_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: fst.s $fa2, $sp, 12 -; CHECK-NEXT: fst.s $fa1, $sp, 8 -; CHECK-NEXT: fst.s $fa0, $sp, 4 -; CHECK-NEXT: movgr2fr.w $fa0, $zero -; CHECK-NEXT: fst.s $fa0, $sp, 0 -; CHECK-NEXT: vld $vr0, $sp, 0 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2 +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 16 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 ; CHECK-NEXT: ret entry: %ins0 = insertelement <4 x float> undef, float 0.0, i32 0 diff --git a/llvm/test/CodeGen/Mips/abiflags-soft-float.ll b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll new file mode 100644 index 0000000..01821f2 --- /dev/null +++ b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll @@ -0,0 +1,12 @@ +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o tmp.o +; RUN: llvm-readobj -A tmp.o | FileCheck %s -check-prefix=OBJ +; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | \ +; RUN: FileCheck %s -check-prefix=ASM + +; OBJ: FP ABI: Soft float +; ASM: .module softfloat + +define dso_local void @asm_is_null() "use-soft-float"="true" { + call void asm sideeffect "", ""() + ret void +} diff --git a/llvm/test/CodeGen/Mips/nan_lowering.ll b/llvm/test/CodeGen/Mips/nan_lowering.ll new file mode 100644 index 0000000..2a11278 --- /dev/null +++ b/llvm/test/CodeGen/Mips/nan_lowering.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple=mips-linux-gnu -mattr=-nan2008 < %s | FileCheck %s +; RUN: llc -mtriple=mips-linux-gnu -mattr=+nan2008 < %s | FileCheck %s + +; Make sure that lowering does not corrupt the value of NaN values, +; regardless of what the NaN mode is. + +define float @test1() { +; CHECK: .4byte 0x7fc00000 + ret float bitcast (i32 u0x7fc00000 to float) +} + +define float @test2() { +; CHECK: .4byte 0x7fc00001 + ret float bitcast (i32 u0x7fc00001 to float) +} + +define float @test3() { +; CHECK: .4byte 0x7f800000 + ret float bitcast (i32 u0x7f800000 to float) +} + +define float @test4() { +; CHECK: .4byte 0x7f800001 + ret float bitcast (i32 u0x7f800001 to float) +} diff --git a/llvm/test/CodeGen/Mips/qnan.ll b/llvm/test/CodeGen/Mips/qnan.ll deleted file mode 100644 index e5b4aa1..0000000 --- a/llvm/test/CodeGen/Mips/qnan.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu < %s -o - | FileCheck %s -check-prefixes=MIPS_Legacy -; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu -mattr=+nan2008 < %s -o - | FileCheck %s -check-prefixes=MIPS_NaN2008 - -define dso_local float @nan(float noundef %a, float noundef %b) local_unnamed_addr #0 { -; MIPS_Legacy: $CPI0_0: -; MIPS_Legacy-NEXT: .4byte 0x7fa00000 # float NaN - -; MIPS_NaN2008: $CPI0_0: -; MIPS_NaN2008-NEXT: .4byte 0x7fc00000 # float NaN - -entry: - %0 = tail call float @llvm.minimum.f32(float %a, float %b) - ret float %0 -} diff --git a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll new file mode 100644 index 0000000..238e200 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc64le < %s | FileCheck %s + +define void @test(ptr %p1, ptr %p2) nounwind { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stdu 1, -224(1) +; CHECK-NEXT: li 5, 48 +; CHECK-NEXT: std 0, 240(1) +; CHECK-NEXT: std 27, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: li 27, 16 +; CHECK-NEXT: std 28, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: li 29, 32 +; CHECK-NEXT: li 28, 48 +; CHECK-NEXT: stxvd2x 56, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 64 +; CHECK-NEXT: std 30, 208(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 30, 4 +; CHECK-NEXT: stxvd2x 57, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 80 +; CHECK-NEXT: stxvd2x 58, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 96 +; CHECK-NEXT: lxvd2x 58, 0, 3 +; CHECK-NEXT: stxvd2x 59, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 112 +; CHECK-NEXT: lxvd2x 59, 3, 27 +; CHECK-NEXT: stxvd2x 60, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 128 +; CHECK-NEXT: stxvd2x 61, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 144 +; CHECK-NEXT: stxvd2x 62, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: li 5, 160 +; CHECK-NEXT: lxvd2x 62, 3, 28 +; CHECK-NEXT: stxvd2x 63, 1, 5 # 16-byte Folded Spill +; CHECK-NEXT: lxvd2x 63, 3, 29 +; CHECK-NEXT: xxswapd 57, 58 +; CHECK-NEXT: xxswapd 1, 59 +; CHECK-NEXT: xxswapd 60, 62 +; CHECK-NEXT: xxswapd 61, 63 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 56, 1 +; CHECK-NEXT: xxlor 1, 59, 59 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 0, 1 +; CHECK-NEXT: xxlor 1, 60, 60 +; CHECK-NEXT: xxmrgld 59, 0, 56 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 60, 1 +; CHECK-NEXT: xxlor 1, 62, 62 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 0, 1 +; CHECK-NEXT: xxlor 1, 61, 61 +; CHECK-NEXT: xxmrgld 62, 0, 60 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 61, 1 +; CHECK-NEXT: xxlor 1, 63, 63 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 0, 1 +; CHECK-NEXT: xxlor 1, 57, 57 +; CHECK-NEXT: xxmrgld 63, 0, 61 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd 61, 1 +; CHECK-NEXT: xxlor 1, 58, 58 +; CHECK-NEXT: bl roundeven +; CHECK-NEXT: nop +; CHECK-NEXT: li 3, 160 +; CHECK-NEXT: stxvd2x 63, 30, 29 +; CHECK-NEXT: xxswapd 0, 1 +; CHECK-NEXT: stxvd2x 62, 30, 28 +; CHECK-NEXT: stxvd2x 59, 30, 27 +; CHECK-NEXT: ld 29, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 28, 192(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 27, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 144 +; CHECK-NEXT: xxmrgld 0, 0, 61 +; CHECK-NEXT: lxvd2x 62, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 128 +; CHECK-NEXT: stxvd2x 0, 0, 30 +; CHECK-NEXT: ld 30, 208(1) # 8-byte Folded Reload +; CHECK-NEXT: lxvd2x 61, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 112 +; CHECK-NEXT: lxvd2x 60, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 96 +; CHECK-NEXT: lxvd2x 59, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 80 +; CHECK-NEXT: lxvd2x 58, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 64 +; CHECK-NEXT: lxvd2x 57, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: li 3, 48 +; CHECK-NEXT: lxvd2x 56, 1, 3 # 16-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 224 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %v = load <8 x double>, ptr %p1, align 64 + %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %v) + store <8 x double> %res, ptr %p2, align 64 + ret void +} + +declare <8 x double> @llvm.roundeven.v8f64(<8 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 021c737..fba592d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -634,3 +634,19 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec) ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results } + +define { <8 x float>, <8 x float> } @deinterleave_unrelated(<16 x float> %arg) { +; CHECK-LABEL: deinterleave_unrelated: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v12, a0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: ret +entry: + %abs = call <16 x float> @llvm.fabs(<16 x float> %arg) + %res = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %abs) + ret { <8 x float>, <8 x float> } %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 0a96e4f..5b1746d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -3744,3 +3744,61 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc %ext = extractvalue {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res, 5 ret <vscale x 1 x float> %ext } + + +define { <8 x float>, <8 x float> } @interleave_deinterleave2(<8 x float> %a, <8 x float> %b) { +; V-LABEL: interleave_deinterleave2: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; V-NEXT: vwaddu.vv v12, v8, v10 +; V-NEXT: li a0, -1 +; V-NEXT: vwmaccu.vx v12, a0, v10 +; V-NEXT: li a0, 32 +; V-NEXT: vnsrl.wx v10, v12, a0 +; V-NEXT: vnsrl.wi v8, v12, 0 +; V-NEXT: ret +; +; ZIP-LABEL: interleave_deinterleave2: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; ZIP-NEXT: vmv2r.v v12, v10 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: ri.vzip2a.vv v16, v8, v12 +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: vnsrl.wx v10, v16, a0 +; ZIP-NEXT: vnsrl.wi v8, v16, 0 +; ZIP-NEXT: ret +entry: + %0 = call <16 x float> @llvm.vector.interleave2.v16f32(<8 x float> %a, <8 x float> %b) + %1 = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %0) + ret { <8 x float>, <8 x float> } %1 +} + +define <16 x float> @deinterleave_interleave2(<16 x float> %arg) { +; V-LABEL: deinterleave_interleave2: +; V: # %bb.0: # %entry +; V-NEXT: li a0, 32 +; V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; V-NEXT: vnsrl.wi v12, v8, 0 +; V-NEXT: vnsrl.wx v14, v8, a0 +; V-NEXT: vwaddu.vv v8, v12, v14 +; V-NEXT: li a0, -1 +; V-NEXT: vwmaccu.vx v8, a0, v14 +; V-NEXT: ret +; +; ZIP-LABEL: deinterleave_interleave2: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: vnsrl.wi v12, v8, 0 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vnsrl.wx v16, v8, a0 +; ZIP-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v8, v12, v16 +; ZIP-NEXT: ret +entry: + %0 = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %arg) + %a = extractvalue { <8 x float>, <8 x float> } %0, 0 + %b = extractvalue { <8 x float>, <8 x float> } %0, 1 + %res = call <16 x float> @llvm.vector.interleave2.v16f32(<8 x float> %a, <8 x float> %b) + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/embed-bitcode.ll b/llvm/test/CodeGen/X86/embed-bitcode.ll index 0d66ba8..d4af954 100644 --- a/llvm/test/CodeGen/X86/embed-bitcode.ll +++ b/llvm/test/CodeGen/X86/embed-bitcode.ll @@ -1,10 +1,23 @@ ; RUN: llc -filetype=obj -mtriple=x86_64 %s -o %t ; RUN: llvm-readelf -S %t | FileCheck %s +; RUN: llc -filetype=obj -mtriple=x86_64-pc-windows-msvc %s -o %t +; RUN: llvm-readobj -S %t | FileCheck %s --check-prefix=COFF ; CHECK: .text PROGBITS 0000000000000000 [[#%x,OFF:]] 000000 00 AX 0 ; CHECK-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] 000004 00 0 ; CHECK-NEXT: .llvmcmd PROGBITS 0000000000000000 [[#%x,OFF:]] 000005 00 0 +; COFF: Name: .llvmbc (2E 6C 6C 76 6D 62 63 00) +; COFF: Characteristics [ +; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES +; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE +; COFF-NEXT: ] +; COFF: Name: .llvmcmd (2E 6C 6C 76 6D 63 6D 64) +; COFF: Characteristics [ +; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES +; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE +; COFF-NEXT: ] + @llvm.embedded.module = private constant [4 x i8] c"BC\C0\DE", section ".llvmbc", align 1 @llvm.cmdline = private constant [5 x i8] c"-cc1\00", section ".llvmcmd", align 1 @llvm.compiler.used = appending global [2 x ptr] [ptr @llvm.embedded.module, ptr @llvm.cmdline], section "llvm.metadata" diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll new file mode 100644 index 0000000..960bbf5 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86-SDAGISEL +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL +; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL +; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL + +; FIXME: We can reuse/delete llvm/test/CodeGen/X86/is_fpclass.ll when all patches are included. + +define i1 @isnone_f(float %x) { +; X86-SDAGISEL-LABEL: isnone_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isnone_f: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) + ret i1 %0 +} + +define i1 @isany_f(float %x) { +; X86-SDAGISEL-LABEL: isany_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isany_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movb $1, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) + ret i1 %0 +} + +define i1 @issignaling_f(float %x) { +; X86-SDAGISEL-LABEL: issignaling_f: +; X86-SDAGISEL: # %bb.0: +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %cl +; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: andb %cl, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: issignaling_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %cl +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: issignaling_f: +; X86-FASTISEL: # %bb.0: +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setl %cl +; X86-FASTISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: andb %cl, %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl + %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" + ret i1 %a0 +} + + define i1 @isquiet_f(float %x) { +; X86-SDAGISEL-LABEL: isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isquiet_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl + entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" + ret i1 %0 +} + +define i1 @not_isquiet_f(float %x) { +; X86-SDAGISEL-LABEL: not_isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isquiet_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setl %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" + ret i1 %0 +} + +define i1 @isinf_f(float %x) { +; X86-SDAGISEL-LABEL: isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isinf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %0 +} + +define i1 @not_isinf_f(float %x) { +; X86-SDAGISEL-LABEL: not_isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isinf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setne %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" + ret i1 %0 +} + +define i1 @is_plus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: is_plus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_plus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_plus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" + ret i1 %0 +} + +define i1 @is_minus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_minus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" + ret i1 %0 +} + +define i1 @not_is_minus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: not_is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: setne %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_is_minus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 +; X86-FASTISEL-NEXT: setne %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" + ret i1 %0 +} + +define i1 @isfinite_f(float %x) { +; X86-SDAGISEL-LABEL: isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isfinite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setl %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %0 +} + +define i1 @not_isfinite_f(float %x) { +; X86-SDAGISEL-LABEL: not_isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isfinite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" + ret i1 %0 +} + +define i1 @is_plus_finite_f(float %x) { +; X86-SDAGISEL-LABEL: is_plus_finite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setb %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_plus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setb %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_plus_finite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setb %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" + ret i1 %0 +} + +define i1 @isnone_d(double %x) nounwind { +; X86-SDAGISEL-LABEL: isnone_d: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isnone_d: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_d: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) + ret i1 %0 +} + +define i1 @isany_d(double %x) nounwind { +; X86-SDAGISEL-LABEL: isany_d: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isany_d: +; X64: # %bb.0: # %entry +; X64-NEXT: movb $1, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_d: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) + ret i1 %0 +} + +define i1 @isnone_f80(x86_fp80 %x) nounwind { +; X86-SDAGISEL-LABEL: isnone_f80: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-SDAGISEL-LABEL: isnone_f80: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: xorl %eax, %eax +; X64-SDAGISEL-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_f80: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isnone_f80: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-FASTISEL-NEXT: fstp %st(0) +; X64-FASTISEL-NEXT: xorl %eax, %eax +; X64-FASTISEL-NEXT: retq +entry: +%0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0) +ret i1 %0 +} + +define i1 @isany_f80(x86_fp80 %x) nounwind { +; X86-SDAGISEL-LABEL: isany_f80: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-SDAGISEL-LABEL: isany_f80: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movb $1, %al +; X64-SDAGISEL-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_f80: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isany_f80: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-FASTISEL-NEXT: fstp %st(0) +; X64-FASTISEL-NEXT: movb $1, %al +; X64-FASTISEL-NEXT: retq +entry: + %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023) + ret i1 %0 +} diff --git a/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir new file mode 100644 index 0000000..e272e7e --- /dev/null +++ b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir @@ -0,0 +1,128 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s +# +# Check that only the computed gotos are duplicated aggressively. +# +--- | + @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)] + declare i64 @f0() + declare i64 @f1() + declare i64 @f2() + declare i64 @f3() + declare i64 @f4() + declare i64 @f5() + define void @computed_goto() { + start: + ret void + bb1: + ret void + bb2: + ret void + bb3: + ret void + bb4: + ret void + } + define void @jump_table() { ret void } + define void @jump_table_pic() { ret void } +... +--- +name: computed_goto +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: computed_goto + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + bb.0: + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %0:gr64 = COPY $rax + %6:gr64_nosp = COPY %0 + JMP_1 %bb.5 + + bb.1.bb1 (ir-block-address-taken %ir-block.bb1): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %1:gr64 = COPY $rax + %6:gr64_nosp = COPY %1 + JMP_1 %bb.5 + + bb.2.bb2 (ir-block-address-taken %ir-block.bb2): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %2:gr64 = COPY $rax + %6:gr64_nosp = COPY %2 + JMP_1 %bb.5 + + bb.3.bb3 (ir-block-address-taken %ir-block.bb3): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %3:gr64 = COPY $rax + %6:gr64_nosp = COPY %3 + JMP_1 %bb.5 + + bb.4.bb4 (ir-block-address-taken %ir-block.bb4): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %4:gr64 = COPY $rax + %6:gr64_nosp = COPY %4 + + bb.5: + successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + + %5:gr64_nosp = COPY %6 + JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg +... |