aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td')
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td525
1 files changed, 411 insertions, 114 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 24ebbc3..1cbb6db 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
}
+// Used for: FP FMA operations, complex FP ops
+class Get6678Latency<string mx> {
+ int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
+}
+
//===----------------------------------------------------------------------===//
class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
@@ -120,6 +125,33 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
defvar SMX60VLEN = 256;
defvar SMX60DLEN = !div(SMX60VLEN, 2);
+class SMX60GetLMulCycles<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : 1,
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ !eq(mx, "MF2") : 1,
+ !eq(mx, "MF4") : 1,
+ !eq(mx, "MF8") : 1
+ );
+}
+
+class SMX60GetVLMAX<string mx, int sew> {
+ defvar LMUL = SMX60GetLMulCycles<mx>.c;
+ int val = !cond(
+ !eq(mx, "MF2") : !div(!div(SMX60VLEN, 2), sew),
+ !eq(mx, "MF4") : !div(!div(SMX60VLEN, 4), sew),
+ !eq(mx, "MF8") : !div(!div(SMX60VLEN, 8), sew),
+ true: !div(!mul(SMX60VLEN, LMUL), sew)
+ );
+}
+
+// Latency for segmented loads and stores is calculated as vl * nf.
+class SMX60SegmentedLdStCycles<string mx, int sew, int nf> {
+ int c = !mul(SMX60GetVLMAX<mx, sew>.val, nf);
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -362,23 +394,43 @@ foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
// Unit-stride loads and stores
- defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>;
+ defvar VLDELatAndOcc = ConstValueUntilLMULThenDoubleBase<"M2", 3, 4, mx>.c;
+ let Latency = VLDELatAndOcc, ReleaseAtCycles = [VLDELatAndOcc] in {
+ defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>;
+ }
+ defvar VSTELatAndOcc = GetLMULValue<[2, 2, 2, 3, 4, 8, 19], mx>.c;
+ let Latency = VSTELatAndOcc, ReleaseAtCycles = [VSTELatAndOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>;
+ }
+ defvar VLDFFLatAndOcc = GetLMULValue<[4, 4, 4, 5, 7, 11, 19], mx>.c;
+ let Latency = VLDFFLatAndOcc, ReleaseAtCycles = [VLDFFLatAndOcc] in {
+ defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>;
+ }
// Mask loads and stores
- defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
- defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
+ let ReleaseAtCycles = [2] in {
+ defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase>;
+ }
+ let Latency = 2, ReleaseAtCycles = [2] in {
+ defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase>;
+ }
// Strided and indexed loads and stores
foreach eew = [8, 16, 32, 64] in {
- defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defvar StridedLdStLatAndOcc = SMX60GetVLMAX<mx, eew>.val;
+ let Latency = StridedLdStLatAndOcc, ReleaseAtCycles = [StridedLdStLatAndOcc] in {
+ defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
- defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defvar IndexedLdStLatAndOcc = !div(SMX60GetVLMAX<mx, eew>.val, 2);
+ let Latency = IndexedLdStLatAndOcc, ReleaseAtCycles = [IndexedLdStLatAndOcc] in {
+ defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
}
}
@@ -388,51 +440,67 @@ foreach mx = SchedMxList in {
foreach eew = [8, 16, 32, 64] in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- // Unit-stride segmented
- defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
-
- // Strided/indexed segmented
- defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
-
- // Indexed segmented
- defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defvar SegmentedLdStLatAndOcc = SMX60SegmentedLdStCycles<mx, eew, nf>.c;
+ let Latency = SegmentedLdStLatAndOcc, ReleaseAtCycles = [SegmentedLdStLatAndOcc] in {
+ // Unit-stride segmented
+ defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Strided/indexed segmented
+ defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Indexed segmented
+ defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
}
}
}
// Whole register move/load/store
foreach LMul = [1, 2, 4, 8] in {
- def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>;
- def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>;
+ defvar WholeRegLdStLatAndOcc = !if(!eq(LMul, 1), 3, !mul(LMul, 2));
+ let Latency = WholeRegLdStLatAndOcc, ReleaseAtCycles = [WholeRegLdStLatAndOcc] in {
+ def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>;
+ def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>;
+ }
- def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>;
+ defvar VMovLatAndOcc = !if(!eq(LMul, 1), 4, !mul(LMul, 2));
+ let Latency = VMovLatAndOcc, ReleaseAtCycles = [VMovLatAndOcc] in {
+ def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>;
+ }
}
// 11. Vector Integer Arithmetic Instructions
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
}
+ // Latency of vadd, vsub, vrsub: 4/4/5/8
+ // ReleaseAtCycles of vadd, vsub, vrsub: 1/2/4/8
+ // Latency of vand, vor, vxor: 4/4/8/16
+ // ReleaseAtCycles of vand, vor, vxor: 2/4/8/16
+ // They are grouped together, so we used the worst case 4/4/8/16 and 2/4/8/16
+ // TODO: use InstRW to override individual instructions' scheduling data
defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
- let Latency = VIALULat, ReleaseAtCycles = [4] in {
- // Pattern of vadd, vsub, vrsub: 4/4/5/8
- // Pattern of vand, vor, vxor: 4/4/8/16
- // They are grouped together, so we used the worst case 4/4/8/16
- // TODO: use InstRW to override individual instructions' scheduling data
+ defvar VIALUOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VIALULat, ReleaseAtCycles = [VIALUOcc] in {
defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+ defvar VILogicalLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VILogicalOcc = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c;
+ let Latency = VILogicalLat, ReleaseAtCycles = [VILogicalOcc] in {
defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
@@ -449,7 +517,9 @@ foreach mx = SchedMxList in {
defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
}
- let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ // Slightly increase Occ when LMUL == M8
+ defvar VICmpCarryOcc = GetLMULValue<[1, 1, 1, 2, 4, 8, 18], mx>.c;
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VICmpCarryOcc] in {
defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
@@ -458,10 +528,14 @@ foreach mx = SchedMxList in {
defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
}
- // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
+ // Latency of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
// e64 = 7,8,16,32. We use the worst-case until we can split the SEW.
// TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
- let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in {
+ defvar VIMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c;
+  // ReleaseAtCycles for vnmsac/vnmsub is 1/1/1/1/2/5 but we use the worst case
+ // here since they are grouped together with vmacc/vmadd/vmul/vmulh.
+ defvar VIMulOcc = ConstOneUntilM1ThenDouble<mx>.c;
+ let Latency = VIMulLat, ReleaseAtCycles = [VIMulOcc] in {
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
@@ -475,7 +549,8 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defvar VIWideningOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [VIWideningOcc] in {
defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
@@ -497,8 +572,9 @@ foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
- let Latency = VIDivLat, ReleaseAtCycles = [12] in {
+ // Not pipelined
+ defvar VIDivLatAndOcc = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
+ let Latency = VIDivLatAndOcc, ReleaseAtCycles = [VIDivLatAndOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
}
@@ -510,7 +586,8 @@ foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
- let Latency = VNarrowingLat, ReleaseAtCycles = [4] in {
+ defvar VNarrowingOcc = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c;
+ let Latency = VNarrowingLat, ReleaseAtCycles = [VNarrowingOcc] in {
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
@@ -558,39 +635,71 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
- }
-}
+ defvar VFALULat = Get4458Latency<mx>.c;
+ defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c;
+ let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
-foreach mx = SchedMxListF in {
- foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ // Slightly increased latency for sew == 64
+ defvar VFMulVLat = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
+ Get4458Latency<mx>.c);
+ let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+      // VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles
+ let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
- defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ // The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
+ // ReleaseAtCycles takes one extra cycle for the vfn* variants.
+ // TODO: Should we split them?
+      // TODO: for some reason, the following !cond is not working and always uses ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c
+ defvar VFMulAddLatency = !if(!eq(sew, 64),
+ Get6678Latency<mx>.c,
+ ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
+ );
+ let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+ // Slightly increased ReleaseAtCycles for M8: 18
+ defvar VFCmpOcc = !if(!eq(mx, "M8"),
+ !add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
+ ConstOneUntilMF2ThenDouble<mx>.c
+ );
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
+ }
- defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
+ defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
// Widening
@@ -598,27 +707,73 @@ foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
+ let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
- defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
+ let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
+ let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
+ // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
+ // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
+ // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
+ // We use the worst-case
+ defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
+ defvar VFWALUOcc = !if(!eq(mx, "M4"),
+ !add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
+ ConstOneUntilMF4ThenDouble<mx>.c
+ );
+ // TODO: Split .wf/.wv variants into separate scheduling classes
+ let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ // Slightly increased latency for SEW == 32
+ defvar VFWMullOcc = !if(!eq(sew, 32),
+ GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c,
+ ConstOneUntilMF2ThenDouble<mx>.c
+ );
+ defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
+ let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ // Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
+ defvar VFWMulAddVLat = !if(!eq(sew, 16),
+ ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
+ Get6678Latency<mx>.c
+ );
+ let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -626,15 +781,23 @@ foreach mx = SchedMxListFW in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
+ let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
-
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+
+ defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
+ defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
+ let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -643,9 +806,35 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ // Compute ReleaseAtCycles based on SEW
+ // Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144
+ // Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320
+ // We use the worst-case, vfdiv.vf is penalized in e64
+ // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
+ defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
+ defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor);
+ let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+
+ defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
+ defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor);
+ let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
+ defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor);
+ let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -654,8 +843,17 @@ foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c;
+ defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c;
+ let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+
+ // Pattern for vredsum: 5/5/5/7/11/19/35
+ // Pattern for vredand, vredor, vredxor: 4/4/4/6/10/18/34
+ // They are grouped together, so we use the worst-case vredsum latency.
+      // TODO: split vredand, vredor, vredxor into separate scheduling classes.
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
@@ -663,7 +861,27 @@ foreach mx = SchedMxListWRed in {
foreach sew = SchedSEWSet<mx, 0, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c;
+ defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c;
+ let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ // Latency for vfredmax.vs, vfredmin.vs: 12/12/15/21/33/57
+ // Latency for vfredusum.vs is slightly lower for e16/e32
+ // We use the worst-case
+ defvar VFRedLat = GetLMULValue<[12, 12, 12, 15, 21, 33, 57], mx>.c;
+ defvar VFRedOcc = GetLMULValue<[8, 8, 8, 8, 14, 20, 57], mx>.c;
+ let Latency = VFRedLat, ReleaseAtCycles = [VFRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -671,9 +889,20 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ // Compute latency based on SEW
+ defvar VFRedOV_FromLat = !cond(
+ !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 12, mx>.c,
+ !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c,
+ !eq(sew, 64) : ConstValueUntilLMULThenDouble<"M1", 12, mx>.c
+ );
+ defvar VFRedOV_FromOcc = !cond(
+ !eq(sew, 16) : GetLMULValue<[8, 8, 20, 24, 48, 96, 384], mx>.c,
+ !eq(sew, 32) : GetLMULValue<[8, 8, 8, 12, 24, 48, 192], mx>.c,
+ !eq(sew, 64) : GetLMULValue<[6, 6, 6, 6, 12, 24, 96], mx>.c
+ );
+ let Latency = VFRedOV_FromLat, ReleaseAtCycles = [VFRedOV_FromOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -681,8 +910,18 @@ foreach mx = SchedMxListFWRed in {
foreach sew = SchedSEWSet<mx, 1, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defvar VFRedOVLat = !cond(
+ !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 16, mx>.c,
+ !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 16, mx>.c,
+ );
+ defvar VFRedOVOcc = !cond(
+ !eq(sew, 16) : GetLMULValue<[11, 11, 27, 32, 64, 128, 512], mx>.c,
+ !eq(sew, 32) : GetLMULValue<[11, 11, 11, 16, 32, 64, 256], mx>.c,
+ );
+ let Latency = VFRedOVLat, ReleaseAtCycles = [VFRedOVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -690,49 +929,103 @@ foreach mx = SchedMxListFWRed in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = 4 in {
+ defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+ let Latency = 4, ReleaseAtCycles = [ConstValueUntilLMULThenDouble<"M2", 1, mx>.c] in {
+ defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>;
+ }
- defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = 6, ReleaseAtCycles = [2] in {
+ defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ defvar VIotaLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VIotaOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VIotaLat, ReleaseAtCycles = [VIotaOcc] in {
+ defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 16. Vector Permutation Instructions
+// Slide
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+ // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16
+ defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VSlideUpLat, ReleaseAtCycles =[VSlideUpOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
- defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+ // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17
+ defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c;
+ defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c;
+ let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+    // The following classes cover both slide-up and slide-down forms, so we
+    // use the worst case (slide down) for all of them.
+ let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+ }
}
-def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
-
-def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set based on MX for now
+// TODO: Split this into separate WriteRes for each MX
+let Latency = 6, ReleaseAtCycles = [6] in {
+ def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+}
-// Gather and Compress
-foreach mx = SchedMxList in {
- foreach sew = SchedSEWSet<mx>.val in {
- defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- }
+// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set based on MX for now
+// TODO: Split this into separate WriteRes for each MX
+let Latency = 4, ReleaseAtCycles = [4] in {
+ def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+ def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+ def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
}
+// Integer LMUL Gather and Compress
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+ defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
+ defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+ defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c;
+ defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c;
+ let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+ }
+      // For sew == 8, latency is double the other cases (the 16-bit index vector needs twice the EMUL), except for the fractional LMULs (const 4 cycles) and M8 (256 in both)
+ defvar VRGatherEI16Lat = !if(!eq(sew, 8),
+ GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c,
+ GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c);
+ defvar VRGatherEI16Occ = !if(!eq(sew, 8),
+ GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c,
+ GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c);
+ let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+ }
+
+ defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c;
+ defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c;
+ let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+ }
+ }
}
// Others
@@ -740,6 +1033,10 @@ def : WriteRes<WriteCSR, [SMX60_IEU]>;
def : WriteRes<WriteNop, [SMX60_IEU]>;
def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>;
+// Give COPY instructions an execution resource.
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
//===----------------------------------------------------------------------===//
// Bypass and advance
def : ReadAdvance<ReadJmp, 0>;