diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2024-03-06 09:36:01 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-06 09:36:01 +0530 |
commit | 0f3628a93749433df51b763ff675152d82a25973 (patch) | |
tree | ab89dc867de1894a0609f01885317e1ccf20c961 | |
parent | a30233f5765071c8b269189758e8b907e19c4724 (diff) | |
download | llvm-0f3628a93749433df51b763ff675152d82a25973.zip llvm-0f3628a93749433df51b763ff675152d82a25973.tar.gz llvm-0f3628a93749433df51b763ff675152d82a25973.tar.bz2 |
AMDGPU: Correct cycle counts for f64 mfma on gfx940 (#83782)
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 54 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SISchedule.td | 4 |
-rw-r--r-- | llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s | 38 |
-rw-r--r-- | llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s | 19 |
4 files changed, 79 insertions, 36 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 00fa93c..7bed0d8 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2538,23 +2538,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { break; case 8: NeedWaitStates = - ST.hasGFX940Insts() - ? isXDL(ST, *MFMA) - ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates - : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates - : SMFMA16x16WriteVgprVALUMemExpReadWaitStates; + isDGEMM(MFMA->getOpcode()) + ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates + : DMFMA16x16WriteVgprVALUReadWaitStates + : ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates + : SMFMA16x16WriteVgprVALUMemExpReadWaitStates; break; case 16: [[fallthrough]]; default: + assert(!isDGEMM(MFMA->getOpcode())); NeedWaitStates = - isDGEMM(MFMA->getOpcode()) - ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates - : DMFMA16x16WriteVgprVALUReadWaitStates - : ST.hasGFX940Insts() - ? isXDL(ST, *MFMA) - ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates - : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates - : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; + ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates + : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; break; } @@ -2633,21 +2634,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates; break; case 8: - NeedWaitStates = ST.hasGFX940Insts() - ? isXDL(ST, *MFMA) - ? GFX940_XDL8PassWriteVgprVALUWawWaitStates - : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates - : SMFMA16x16WriteVgprVALUWawWaitStates; + NeedWaitStates = + isDGEMM(MFMA->getOpcode()) ? 
DMFMA16x16WriteVgprVALUWriteWaitStates + : + + ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) ? GFX940_XDL8PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates + : SMFMA16x16WriteVgprVALUWawWaitStates; break; case 16: [[fallthrough]]; default: - NeedWaitStates = isDGEMM(MFMA->getOpcode()) - ? DMFMA16x16WriteVgprVALUWriteWaitStates - : ST.hasGFX940Insts() - ? isXDL(ST, *MFMA) - ? GFX940_XDL16PassWriteVgprVALUWawWaitStates - : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates - : SMFMA32x32WriteVgprVALUWawWaitStates; + assert(!isDGEMM(MFMA->getOpcode())); + NeedWaitStates = + ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL16PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates + : SMFMA32x32WriteVgprVALUWawWaitStates; break; } diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index b0e8e41..a60b1f2 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -165,8 +165,10 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<WriteTrans32, 4>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; + let ReleaseAtCycles = [4] in def : HWVALUWriteRes<Write4PassDGEMM, 4>; - def : HWVALUWriteRes<Write8PassDGEMM, 16>; + let ReleaseAtCycles = [8] in + def : HWVALUWriteRes<Write8PassDGEMM, 8>; let ReleaseAtCycles = [2] in def : HWWriteRes<Write2PassMAI, [HWXDL], 2>; diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s new file mode 100644 index 0000000..6b4ddb3 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx90a-mfma.s @@ -0,0 +1,38 @@ +# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] 
Instructions: +# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] +# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1] +# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] +# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7] + + +# CHECK: Resources: +# CHECK-NEXT: [0] - HWBranch +# CHECK-NEXT: [1] - HWExport +# CHECK-NEXT: [2] - HWLGKM +# CHECK-NEXT: [3] - HWSALU +# CHECK-NEXT: [4] - HWVALU +# CHECK-NEXT: [5] - HWVMEM +# CHECK-NEXT: [6] - HWXDL + +# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1] +# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] +# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7] +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1] + + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7] + diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s index e7ddeee..0e1efbe 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s @@ -2,7 +2,7 @@ # CHECK: Iterations: 1 # CHECK: Instructions: 78 -# CHECK: Total Cycles: 699 +# CHECK: Total Cycles: 701 # CHECK: Total uOps: 78 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5] @@ -128,11 +128,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 # CHECK-NEXT:[6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: - -# CHECK: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] -# CHECK-NEXT: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] -# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], 
v[2:3], a[0:7] -# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] +# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] +# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] +# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] +# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] # CHECK: Resources: # CHECK: [0] - HWBranch @@ -148,10 +147,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5] # CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33] # CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] -# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3] +# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3] +# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] +# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7] # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3] # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15] |