about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2024-03-06 09:36:01 +0530
committer: GitHub <noreply@github.com> 2024-03-06 09:36:01 +0530
commit: 0f3628a93749433df51b763ff675152d82a25973 (patch)
tree: ab89dc867de1894a0609f01885317e1ccf20c961
parent: a30233f5765071c8b269189758e8b907e19c4724 (diff)
download: llvm-0f3628a93749433df51b763ff675152d82a25973.zip
          llvm-0f3628a93749433df51b763ff675152d82a25973.tar.gz
          llvm-0f3628a93749433df51b763ff675152d82a25973.tar.bz2
AMDGPU: Correct cycle counts for f64 mfma on gfx940 (#83782)
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 54
-rw-r--r-- llvm/lib/Target/AMDGPU/SISchedule.td | 4
-rw-r--r-- llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s | 38
-rw-r--r-- llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s | 19
4 files changed, 79 insertions, 36 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 00fa93c..7bed0d8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2538,23 +2538,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
break;
case 8:
NeedWaitStates =
- ST.hasGFX940Insts()
- ? isXDL(ST, *MFMA)
- ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
- : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
- : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+ isDGEMM(MFMA->getOpcode())
+ ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
+ : DMFMA16x16WriteVgprVALUReadWaitStates
+ : ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
+ : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
break;
case 16: [[fallthrough]];
default:
+ assert(!isDGEMM(MFMA->getOpcode()));
NeedWaitStates =
- isDGEMM(MFMA->getOpcode())
- ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
- : DMFMA16x16WriteVgprVALUReadWaitStates
- : ST.hasGFX940Insts()
- ? isXDL(ST, *MFMA)
- ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
- : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
- : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+ ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
+ : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
+ : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
break;
}
@@ -2633,21 +2634,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
: GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
break;
case 8:
- NeedWaitStates = ST.hasGFX940Insts()
- ? isXDL(ST, *MFMA)
- ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
- : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
- : SMFMA16x16WriteVgprVALUWawWaitStates;
+ NeedWaitStates =
+ isDGEMM(MFMA->getOpcode()) ? DMFMA16x16WriteVgprVALUWriteWaitStates
+ :
+
+ ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA) ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
+ : SMFMA16x16WriteVgprVALUWawWaitStates;
break;
case 16: [[fallthrough]];
default:
- NeedWaitStates = isDGEMM(MFMA->getOpcode())
- ? DMFMA16x16WriteVgprVALUWriteWaitStates
- : ST.hasGFX940Insts()
- ? isXDL(ST, *MFMA)
- ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
- : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
- : SMFMA32x32WriteVgprVALUWawWaitStates;
+ assert(!isDGEMM(MFMA->getOpcode()));
+ NeedWaitStates =
+ ST.hasGFX940Insts()
+ ? isXDL(ST, *MFMA)
+ ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
+ : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
+ : SMFMA32x32WriteVgprVALUWawWaitStates;
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b0e8e41..a60b1f2 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ let ReleaseAtCycles = [4] in
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
- def : HWVALUWriteRes<Write8PassDGEMM, 16>;
+ let ReleaseAtCycles = [8] in
+ def : HWVALUWriteRes<Write8PassDGEMM, 8>;
let ReleaseAtCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
new file mode 100644
index 0000000..6b4ddb3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - HWBranch
+# CHECK-NEXT: [1] - HWExport
+# CHECK-NEXT: [2] - HWLGKM
+# CHECK-NEXT: [3] - HWSALU
+# CHECK-NEXT: [4] - HWVALU
+# CHECK-NEXT: [5] - HWVMEM
+# CHECK-NEXT: [6] - HWXDL
+
+# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
+
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
+
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
index e7ddeee..0e1efbe 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s
@@ -2,7 +2,7 @@
# CHECK: Iterations: 1
# CHECK: Instructions: 78
-# CHECK: Total Cycles: 699
+# CHECK: Total Cycles: 701
# CHECK: Total uOps: 78
v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
@@ -128,11 +128,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
# CHECK-NEXT:[6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-
-# CHECK: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
# CHECK: Resources:
# CHECK: [0] - HWBranch
@@ -148,10 +147,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
+# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]