author     Vitaly Buka <vitalybuka@google.com>   2024-04-04 17:49:07 -0700
committer  Vitaly Buka <vitalybuka@google.com>   2024-04-04 17:49:07 -0700
commit     a724510541fc3272c9d4415c89b4549d8d149675 (patch)
tree       5090317c71cf2ae73fb91a32f8dd6f8e037e4603 /llvm/lib/Target
parent     2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310 (diff)
parent     b76eb1ddfbacda273b8e6a9940f1da6812fdc2e0 (diff)
[spr] changes introduced through rebase (branch: users/vitalybuka/spr/main.rename-remove-traps-to-lower-builtin-hot)
Created using spr 1.3.4
[skip ci]
Diffstat (limited to 'llvm/lib/Target')
49 files changed, 741 insertions, 413 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6425aa9..3af427d 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -391,9 +391,18 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
                                         "equivalent when the immediate does "
                                         "not fit in the encoding.">;
 
-def FeatureAddrLSLFast : SubtargetFeature<
-    "addr-lsl-fast", "HasAddrLSLFast", "true",
-    "Address operands with logical shift of up to 3 places are cheap">;
+// Address operands with shift amount 2 or 3 are fast on all Arm chips except
+// some old Apple cores (A7-A10?) which handle all shifts slowly. Cortex-A57
+// and derived designs through Cortex-X1 take an extra micro-op for shifts
+// of 1 or 4. Other Arm chips handle all shifted operands at the same speed
+// as unshifted operands.
+//
+// We don't try to model the behavior of the old Apple cores because new code
+// targeting A7 is very unlikely to actually run on an A7. The Cortex cores
+// are modeled by FeatureAddrLSLSlow14.
+def FeatureAddrLSLSlow14 : SubtargetFeature<
+    "addr-lsl-slow-14", "HasAddrLSLSlow14", "true",
+    "Address operands with shift amount of 1 or 4 are slow">;
 
 def FeatureALULSLFast : SubtargetFeature<
     "alu-lsl-fast", "HasALULSLFast", "true",
@@ -885,6 +894,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                     FeatureBalanceFPOps,
                     FeatureFuseAdrpAdd,
                     FeatureFuseLiterals,
+                    FeatureAddrLSLSlow14,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
@@ -903,6 +913,7 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
                     FeatureFuseLiterals,
+                    FeatureAddrLSLSlow14,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
 
@@ -910,6 +921,7 @@ def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
                     "Cortex-A73 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
+                    FeatureAddrLSLSlow14,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
 
@@ -917,6 +929,7 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
                     "Cortex-A75 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
+                    FeatureAddrLSLSlow14,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
 
@@ -924,7 +937,7 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
                     "Cortex-A76 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
@@ -934,7 +947,7 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeatureEnableSelectOptimize,
                     FeaturePredictableSelectIsExpensive]>;
@@ -944,7 +957,7 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -956,7 +969,7 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -968,7 +981,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -979,7 +992,6 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -990,7 +1002,6 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
                     FeatureFuseAES,
                     FeaturePostRAScheduler,
                     FeatureCmpBccFusion,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureFuseAdrpAdd,
                     FeatureEnableSelectOptimize,
@@ -1001,7 +1012,6 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
                     FeatureFuseAES,
                     FeaturePostRAScheduler,
                     FeatureCmpBccFusion,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureFuseAdrpAdd,
                     FeatureEnableSelectOptimize,
@@ -1012,7 +1022,6 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
                     FeatureFuseAES,
                     FeaturePostRAScheduler,
                     FeatureCmpBccFusion,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureFuseAdrpAdd,
                     FeatureEnableSelectOptimize,
@@ -1028,7 +1037,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1039,7 +1048,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
                     FeatureCmpBccFusion,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1047,7 +1055,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
 
 def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
                     "Cortex-X3 ARM processors", [
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureFuseAdrpAdd,
                     FeatureFuseAES,
@@ -1057,7 +1064,6 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
 
 def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
                     "Cortex-X4 ARM processors", [
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureFuseAdrpAdd,
                     FeatureFuseAES,
@@ -1215,7 +1221,6 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                     FeatureFuseAdrpAdd,
                     FeatureFuseLiterals,
                     FeatureStorePairSuppress,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeaturePredictableSelectIsExpensive]>;
@@ -1234,7 +1239,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                     FeatureFuseAdrpAdd,
                     FeatureFuseLiterals,
                     FeatureStorePairSuppress,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureZCZeroing]>;
@@ -1244,7 +1248,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                     FeaturePostRAScheduler,
                     FeaturePredictableSelectIsExpensive,
                     FeatureZCZeroing,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureStorePairSuppress]>;
 
@@ -1254,7 +1257,6 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                     FeaturePredictableSelectIsExpensive,
                     FeatureZCZeroing,
                     FeatureStorePairSuppress,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureSlowSTRQro]>;
 
@@ -1268,7 +1270,7 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1",
                     "Neoverse N1 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1278,7 +1280,6 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
                     "Neoverse N2 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1288,7 +1289,6 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
                     "Neoverse 512-TVB ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1298,7 +1298,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
                     "Neoverse V1 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
+                    FeatureAddrLSLSlow14,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1309,7 +1309,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2",
                     "Neoverse V2 ARM processors", [
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeaturePostRAScheduler,
                     FeatureEnableSelectOptimize,
@@ -1321,7 +1320,6 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                     FeaturePredictableSelectIsExpensive,
                     FeatureZCZeroing,
                     FeatureStorePairSuppress,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast]>;
 
 def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
@@ -1381,7 +1379,6 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                     FeaturePostRAScheduler,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureAggressiveFMA,
                     FeatureArithmeticBccFusion,
@@ -1397,7 +1394,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                     FeaturePostRAScheduler,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureAggressiveFMA,
                     FeatureArithmeticBccFusion,
@@ -1414,7 +1410,6 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
                     FeaturePostRAScheduler,
                     FeatureFuseAES,
                     FeatureFuseAdrpAdd,
-                    FeatureAddrLSLFast,
                     FeatureALULSLFast,
                     FeatureAggressiveFMA,
                     FeatureArithmeticBccFusion,
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 4fa719a..f6ccd0e 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -268,13 +268,19 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
     if (Sign->getZExtValue())
       Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
 
-  if (Flags == 0)
-    return;
+  uint64_t PAuthABIPlatform = -1;
+  if (const auto *PAP = mdconst::extract_or_null<ConstantInt>(
+          M.getModuleFlag("aarch64-elf-pauthabi-platform")))
+    PAuthABIPlatform = PAP->getZExtValue();
+  uint64_t PAuthABIVersion = -1;
+  if (const auto *PAV = mdconst::extract_or_null<ConstantInt>(
+          M.getModuleFlag("aarch64-elf-pauthabi-version")))
+    PAuthABIVersion = PAV->getZExtValue();
 
   // Emit a .note.gnu.property section with the flags.
   auto *TS =
       static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
-  TS->emitNoteSection(Flags);
+  TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion);
 }
 
 void AArch64AsmPrinter::emitFunctionHeaderComment() {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 163ed52..51bec36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -462,7 +462,7 @@ private:
                              SDValue &Offset, SDValue &SignExtend,
                              SDValue &DoShift);
   bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
-  bool isWorthFoldingAddr(SDValue V) const;
+  bool isWorthFoldingAddr(SDValue V, unsigned Size) const;
   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
                          SDValue &Offset, SDValue &SignExtend);
 
@@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) {
 
 /// Determine whether it is worth to fold V into an extended register addressing
 /// mode.
-bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
   // Trivial if we are optimizing for code size or if there is only
   // one use of the value.
   if (CurDAG->shouldOptForSize() || V.hasOneUse())
     return true;
-  // If a subtarget has a fastpath LSL we can fold a logical shift into
-  // the addressing mode and save a cycle.
-  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
-      isWorthFoldingSHL(V))
+
+  // If a subtarget has a slow shift, folding a shift into multiple loads
+  // costs additional micro-ops.
+  if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
+    return false;
+
+  // Check whether we're going to emit the address arithmetic anyway because
+  // it's used by a non-address operation.
+  if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
     return true;
-  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
+  if (V.getOpcode() == ISD::ADD) {
     const SDValue LHS = V.getOperand(0);
     const SDValue RHS = V.getOperand(1);
     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
     return false;
 
-  return isWorthFoldingAddr(N);
+  return isWorthFoldingAddr(N, Size);
 }
 
 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFoldingAddr(LHS))
+    if (isWorthFoldingAddr(LHS, Size))
       return true;
   }
 
@@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFoldingAddr(RHS))
+    if (isWorthFoldingAddr(RHS, Size))
       return true;
   }
 
@@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d0c5e6b..22687b0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
       return false;
     Shift = AArch64_AM::getShiftValue(Shift);
     if (!OptSize) {
-      if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+      if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
         return false;
       if (avoidSlowSTRQ(MemI))
         return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a8f2c45..d4daf17 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
       MI.getParent()->getParent()->getFunction().hasOptSize())
     return true;
 
-  // It's better to avoid folding and recomputing shifts when we don't have a
-  // fastpath.
-  if (!STI.hasAddrLSLFast())
-    return false;
+  // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
+  // appropriate.
 
   // We have a fastpath, so folding a shift in and potentially computing it
   // many times may be beneficial. Check if this is only used in memory ops.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 33dba6a5..043f142 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1141,9 +1141,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarize(1)
       .lower();
 
-  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
-      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
-
   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
      .lower();
@@ -1191,8 +1188,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .minScalarEltSameAsIf(always, 1, 0)
       .maxScalarEltSameAsIf(always, 1, 0);
 
-  // TODO: Vector types.
-  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
+  getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+      .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
+      .clampNumElements(0, v8s8, v16s8)
+      .clampNumElements(0, v4s16, v8s16)
+      .clampNumElements(0, v2s32, v4s32)
+      .clampMaxNumElements(0, s64, 2)
+      .moreElementsToNextPow2(0)
+      .lower();
 
   // TODO: Libcall support for s128.
   // TODO: s16 should be legal with full FP16 support.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index e1d6dd7..dc5383c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -58,8 +58,17 @@ void AArch64TargetStreamer::finish() {
     emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
 }
 
-void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
-  if (Flags == 0)
+void AArch64TargetStreamer::emitNoteSection(unsigned Flags,
+                                            uint64_t PAuthABIPlatform,
+                                            uint64_t PAuthABIVersion) {
+  assert((PAuthABIPlatform == uint64_t(-1)) ==
+         (PAuthABIVersion == uint64_t(-1)));
+  uint64_t DescSz = 0;
+  if (Flags != 0)
+    DescSz += 4 * 4;
+  if (PAuthABIPlatform != uint64_t(-1))
+    DescSz += 4 + 4 + 8 * 2;
+  if (DescSz == 0)
     return;
 
   MCStreamer &OutStreamer = getStreamer();
@@ -80,15 +89,25 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags)
   // Emit the note header.
   OutStreamer.emitValueToAlignment(Align(8));
   OutStreamer.emitIntValue(4, 4);      // data size for "GNU\0"
-  OutStreamer.emitIntValue(4 * 4, 4);  // Elf_Prop size
+  OutStreamer.emitIntValue(DescSz, 4); // Elf_Prop array size
   OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
   OutStreamer.emitBytes(StringRef("GNU", 4)); // note name
 
   // Emit the PAC/BTI properties.
-  OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
-  OutStreamer.emitIntValue(4, 4);     // data size
-  OutStreamer.emitIntValue(Flags, 4); // data
-  OutStreamer.emitIntValue(0, 4);     // pad
+  if (Flags != 0) {
+    OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+    OutStreamer.emitIntValue(4, 4);     // data size
+    OutStreamer.emitIntValue(Flags, 4); // data
+    OutStreamer.emitIntValue(0, 4);     // pad
+  }
+
+  // Emit the PAuth ABI compatibility info
+  if (PAuthABIPlatform != uint64_t(-1)) {
+    OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_PAUTH, 4);
+    OutStreamer.emitIntValue(8 * 2, 4); // data size
+    OutStreamer.emitIntValue(PAuthABIPlatform, 8);
+    OutStreamer.emitIntValue(PAuthABIVersion, 8);
+  }
 
   OutStreamer.endSection(Nt);
   OutStreamer.switchSection(Cur);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 7676d88..e8a9dc4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -35,7 +35,8 @@ public:
   void emitCurrentConstantPool();
 
   /// Callback used to implement the .note.gnu.property section.
-  void emitNoteSection(unsigned Flags);
+  void emitNoteSection(unsigned Flags, uint64_t PAuthABIPlatform = -1,
+                       uint64_t PAuthABIVersion = -1);
 
   /// Callback used to implement the .inst directive.
   virtual void emitInst(uint32_t Inst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 9083150..1114a8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1086,7 +1086,7 @@ void SplitPtrStructs::processConditionals() {
       if (MaybeRsrc)
         for (Value *V : Seen)
           FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
-    } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+    } else if (isa<SelectInst>(I)) {
       if (MaybeRsrc) {
         ConditionalTemps.push_back(cast<Instruction>(Rsrc));
         Rsrc->replaceAllUsesWith(*MaybeRsrc);
@@ -1777,8 +1777,8 @@ void SplitPtrStructs::processFunction(Function &F) {
     Originals.push_back(&I);
   for (Instruction *I : Originals) {
     auto [Rsrc, Off] = visit(I);
-    assert((Rsrc && Off) ||
-           (!Rsrc && !Off) && "Can't have a resource but no offset");
+    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
+           "Can't have a resource but no offset");
     if (Rsrc)
       RsrcParts[I] = Rsrc;
     if (Off)
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 294fc68..3866723 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4627,10 +4627,15 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
   if (Src1Idx >= 0) {
     const MCOperand &Src1 = Inst.getOperand(Src1Idx);
     const MCRegisterInfo *TRI = getContext().getRegisterInfo();
-    if (Src1.isImm() ||
-        (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
-      AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
-      Error(Op.getStartLoc(), "invalid operand for instruction");
+    if (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI)) {
+      auto Reg = mc2PseudoReg(Inst.getOperand(Src1Idx).getReg());
+      SMLoc S = getRegLoc(Reg, Operands);
+      Error(S, "invalid operand for instruction");
+      return false;
+    }
+    if (Src1.isImm()) {
+      Error(getInstLoc(Operands),
+            "src1 immediate operand invalid for instruction");
       return false;
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
index f4f02d2..0541f0f 100644
--- a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
@@ -112,7 +112,7 @@ class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> :
               lds.Mnemonic # asm,
               ins,
               lds.is_direct>,
-    SIMCInstr <lds.Mnemonic, subtarget> {
+    SIMCInstr <lds.PseudoInstr, subtarget> {
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e944dde..0773ef7 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1192,7 +1192,7 @@ def : GCNPat <
 class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
                                                string opName = ps.Mnemonic,
                                                bit hasGDS = true>
-  : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
+  : DS_Real<ps, opName>, SIMCInstr <ps.PseudoInstr, ef> {
 
   let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
   let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
@@ -1557,7 +1557,7 @@ defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
 
 class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
   DS_Real <ps>,
-  SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> {
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
   let AssemblerPredicate = isGFX8GFX9;
   let DecoderNamespace = "GFX8";
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d017ec4..27d5616 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2558,7 +2558,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
 
 multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
                                     string name = get_FLAT_ps<NAME>.Mnemonic,
-                                    string alias = ""> :
+                                    string alias = name> :
   VFLAT_Real_Base_gfx12<op, name, alias> {
   defm _RTN : VFLAT_Real_gfx12<op, name>;
 }
@@ -2581,7 +2581,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
 
 multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
                                       string name = get_FLAT_ps<NAME>.Mnemonic,
-                                      string alias = ""> :
+                                      string alias = name> :
   VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
   defm _RTN : VFLAT_Real_gfx12<op, name>;
   defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2762190..bb499c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -708,9 +708,6 @@ public:
                                WaitcntBrackets &ScoreBrackets,
                                MachineInstr *OldWaitcntInstr,
                                bool FlushVmCnt);
-  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
-                               WaitcntBrackets &ScoreBrackets,
-                               MachineInstr *OldWaitcntInstr);
   bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                        MachineBasicBlock::instr_iterator It,
                        MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                          OldWaitcntInstr);
 }
 
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
-                                               WaitcntBrackets &ScoreBrackets,
-                                               MachineInstr *OldWaitcntInstr) {
-  AMDGPU::Waitcnt Wait;
-
-  unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
-  unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
-  unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
-  if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
-    return false;
-
-  if (LoadCntPending != 0)
-    Wait.LoadCnt = 0;
-  if (SampleCntPending != 0)
-    Wait.SampleCnt = 0;
-  if (BvhCntPending != 0)
-    Wait.BvhCnt = 0;
-
-  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
-                         OldWaitcntInstr);
-}
-
 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                        MachineBasicBlock::instr_iterator It,
                                        MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     ++Iter;
   }
 
+  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
+  // needed.
+  AMDGPU::Waitcnt Wait;
   if (Block.getFirstTerminator() == Block.end() &&
-      isPreheaderToFlush(Block, ScoreBrackets))
-    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+      isPreheaderToFlush(Block, ScoreBrackets)) {
+    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+      Wait.LoadCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+      Wait.SampleCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+      Wait.BvhCnt = 0;
+  }
+
+  // Combine or remove any redundant waitcnts at the end of the block.
+  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+                              OldWaitcntInstr);
 
   return Modified;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1694436..f1afbcc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2268,7 +2268,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret;
   field Operand Src2ModDPP = getSrcModDPP<Src2VT>.ret;
   field Operand Src0ModVOP3DPP = getSrcModDPP<Src0VT>.ret;
-  field Operand Src1ModVOP3DPP = getSrcModDPP<Src1VT>.ret;
+  field Operand Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT>.ret;
  field Operand Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT>.ret;
   field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
   field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d34ee34..0b7d45e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1972,7 +1972,7 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
 multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOP1_Pseudo>(NAME);
   def _gfx11 : SOP1_Real<op, ps, name>,
-               Select_gfx11<ps.Mnemonic>;
+               Select_gfx11<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
 }
@@ -1980,14 +1980,14 @@ multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
 multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOP1_Pseudo>(NAME);
   def _gfx12 : SOP1_Real<op, ps, name>,
-               Select_gfx12<ps.Mnemonic>;
+               Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
 
 multiclass SOP1_M0_Real_gfx12<bits<8> op> {
   def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
-               Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic> {
+               Select_gfx12<!cast<SOP1_Pseudo>(NAME).PseudoInstr> {
     let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
   }
 }
@@ -1995,7 +1995,7 @@ multiclass SOP1_M0_Real_gfx12<bits<8> op> {
 multiclass SOP1_IMM_Real_gfx12<bits<8> op> {
   defvar ps = !cast<SOP1_Pseudo>(NAME);
   def _gfx12 : SOP1_Real<op, ps>,
-               Select_gfx12<ps.Mnemonic>;
+               Select_gfx12<ps.PseudoInstr>;
 }
 
 multiclass SOP1_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> :
@@ -2106,7 +2106,7 @@ defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>;
 multiclass SOP1_Real_gfx10<bits<8> op> {
   defvar ps = !cast<SOP1_Pseudo>(NAME);
   def _gfx10 : SOP1_Real<op, ps>,
-               Select_gfx10<ps.Mnemonic>;
+               Select_gfx10<ps.PseudoInstr>;
 }
 
 multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> :
@@ -2139,7 +2139,7 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
 multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
   defvar ps = !cast<SOP1_Pseudo>(NAME);
   def _gfx6_gfx7 : SOP1_Real<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>;
+                   Select_gfx6_gfx7<ps.PseudoInstr>;
 }
 
 multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
@@ -2205,7 +2205,7 @@ defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
 multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOP2_Pseudo>(NAME);
   def _gfx12 : SOP2_Real32<op, ps, name>,
-               Select_gfx12<ps.Mnemonic>;
+               Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
@@ -2222,7 +2222,7 @@ defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
 multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOP2_Pseudo>(NAME);
   def _gfx11 : SOP2_Real32<op, ps, name>,
-               Select_gfx11<ps.Mnemonic>;
+               Select_gfx11<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
 }
@@ -2283,12 +2283,12 @@ defm S_MUL_U64 : SOP2_Real_gfx12<0x055>;
 
 multiclass SOP2_Real_FMAK_gfx12<bits<7> op> {
   def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
-               Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+               Select_gfx12<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOP2_Real_FMAK_gfx11<bits<7> op> {
   def _gfx11 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
-               Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+               Select_gfx11<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> :
@@ -2325,7 +2325,7 @@ defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">;
 multiclass SOP2_Real_gfx10<bits<7> op> {
   defvar ps = !cast<SOP2_Pseudo>(NAME);
   def _gfx10 : SOP2_Real32<op, ps>,
-               Select_gfx10<ps.Mnemonic>;
+               Select_gfx10<ps.PseudoInstr>;
 }
 
 multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> :
@@ -2348,7 +2348,7 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
 multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
   defvar ps = !cast<SOP2_Pseudo>(NAME);
   def _gfx6_gfx7 : SOP2_Real32<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>;
+                   Select_gfx6_gfx7<ps.PseudoInstr>;
 }
 
 multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
@@ -2410,24 +2410,24 @@ defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
 multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOPK_Pseudo>(NAME);
   def _gfx12 : SOPK_Real32<op, ps, name>,
-               Select_gfx12<ps.Mnemonic>;
+               Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
 
 multiclass SOPK_Real32_gfx11<bits<5> op> {
   def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
-               Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+               Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOPK_Real64_gfx12<bits<5> op> {
   def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
-               Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+               Select_gfx12<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOPK_Real64_gfx11<bits<5> op> {
   def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
-               Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+               Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
@@ -2454,13 +2454,13 @@ defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
 multiclass SOPK_Real32_gfx10<bits<5> op> {
   defvar ps = !cast<SOPK_Pseudo>(NAME);
   def _gfx10 : SOPK_Real32<op, ps>,
-               Select_gfx10<ps.Mnemonic>;
+               Select_gfx10<ps.PseudoInstr>;
 }
 
 multiclass SOPK_Real64_gfx10<bits<5> op> {
   defvar ps = !cast<SOPK_Pseudo>(NAME);
   def _gfx10 : SOPK_Real64<op, ps>,
-               Select_gfx10<ps.Mnemonic>;
+               Select_gfx10<ps.PseudoInstr>;
 }
 
 multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
@@ -2485,13 +2485,13 @@ defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
 multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
   defvar ps = !cast<SOPK_Pseudo>(NAME);
   def _gfx6_gfx7 : SOPK_Real32<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>;
+                   Select_gfx6_gfx7<ps.PseudoInstr>;
 }
 
 multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
   defvar ps = !cast<SOPK_Pseudo>(NAME);
   def _gfx6_gfx7 : SOPK_Real64<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>;
+                   Select_gfx6_gfx7<ps.PseudoInstr>;
 }
 
 multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
@@ -2539,7 +2539,7 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
 multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx12 : SOPP_Real_32<op, ps, name>,
-               Select_gfx12<ps.Mnemonic>;
+               Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
@@ -2564,7 +2564,7 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
 multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx11 : SOPP_Real_32<op, ps, name>,
-               Select_gfx11<ps.Mnemonic>,
+               Select_gfx11<ps.PseudoInstr>,
                SOPPRelaxTable<0, ps.KeyName, "_gfx11">;
   if !ne(ps.Mnemonic, name) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
@@ -2572,13 +2572,13 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
 multiclass SOPP_Real_64_gfx12<bits<7> op> {
   def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME),
                             !cast<SOPP_Pseudo>(NAME).Mnemonic>,
-               Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+               Select_gfx12<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
               SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
 }
 
 multiclass SOPP_Real_64_gfx11<bits<7> op> {
   def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME),
                             !cast<SOPP_Pseudo>(NAME).Mnemonic>,
-               Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+               Select_gfx11<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
                SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
 }
 
@@ -2654,21 +2654,21 @@ defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>;
 multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>,
-                   Select_gfx6_gfx7<ps.Mnemonic>,
+                   Select_gfx6_gfx7<ps.PseudoInstr>,
                    SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
 }
 
 multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _vi : SOPP_Real_32<op, ps>,
-            Select_vi<ps.Mnemonic>,
+            Select_vi<ps.PseudoInstr>,
             SOPPRelaxTable<0, ps.KeyName, "_vi">;
 }
 
 multiclass SOPP_Real_32_gfx10<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx10 : SOPP_Real_32<op, ps>,
-               Select_gfx10<ps.Mnemonic>,
+               Select_gfx10<ps.PseudoInstr>,
                SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
 }
 
@@ -2691,21 +2691,21 @@ multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> :
 multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx6_gfx7 : SOPP_Real_64<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>,
+                   Select_gfx6_gfx7<ps.PseudoInstr>,
                    SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">;
 }
 
 multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _vi : SOPP_Real_64<op, ps>,
-            Select_vi<ps.Mnemonic>,
+            Select_vi<ps.PseudoInstr>,
             SOPPRelaxTable<1, ps.KeyName, "_vi">;
 }
 
 multiclass SOPP_Real_64_gfx10<bits<7> op> {
   defvar ps = !cast<SOPP_Pseudo>(NAME);
   def _gfx10 : SOPP_Real_64<op, ps>,
-               Select_gfx10<ps.Mnemonic>,
+               Select_gfx10<ps.PseudoInstr>,
                SOPPRelaxTable<1, ps.KeyName, "_gfx10">;
 }
 
@@ -2771,12 +2771,12 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
 multiclass SOPC_Real_gfx12<bits<7> op> {
   def _gfx12 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
-               Select_gfx12<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+               Select_gfx12<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOPC_Real_gfx11<bits<7> op> {
   def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
-               Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+               Select_gfx11<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
 }
 
 multiclass SOPC_Real_gfx11_gfx12<bits<7> op> :
@@ -2826,19 +2826,19 @@ defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>;
 multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
   defvar ps = !cast<SOPC_Pseudo>(NAME);
   def _gfx6_gfx7 : SOPC_Real<op, ps>,
-                   Select_gfx6_gfx7<ps.Mnemonic>;
+                   Select_gfx6_gfx7<ps.PseudoInstr>;
 }
 
 multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
   defvar ps = !cast<SOPC_Pseudo>(NAME);
   def _vi : SOPC_Real<op, ps>,
-            Select_vi<ps.Mnemonic>;
+            Select_vi<ps.PseudoInstr>;
 }
 
 multiclass SOPC_Real_gfx10<bits<7> op> {
   defvar ps = !cast<SOPC_Pseudo>(NAME);
   def _gfx10 : SOPC_Real<op, ps>,
-               Select_gfx10<ps.Mnemonic>;
+               Select_gfx10<ps.PseudoInstr>;
 }
 
 multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
@@ -2878,15 +2878,15 @@ defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>;
 class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
   SOP1_Real<op, ps>,
-  Select_vi<ps.Mnemonic>;
+  Select_vi<ps.PseudoInstr>;
 
 class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
   SOP2_Real32<op, ps>,
-  Select_vi<ps.Mnemonic>;
+  Select_vi<ps.PseudoInstr>;
 
 class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
   SOPK_Real32<op, ps>,
-  Select_vi<ps.Mnemonic>;
+  Select_vi<ps.PseudoInstr>;
 
 def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>;
 def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>;
@@ -3007,7 +3007,7 @@ def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>;
 def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
 //def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
 def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
-                            Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+                            Select_vi<S_SETREG_IMM32_B32.PseudoInstr>;
 
 def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5d44396..4e00744 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -182,6 +182,8 @@ unsigned getAMDHSACodeObjectVersion(unsigned ABIVersion) {
     return 4;
   case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
     return 5;
+  case ELF::ELFABIVERSION_AMDGPU_HSA_V6:
+    return 6;
   default:
     return getDefaultAMDHSACodeObjectVersion();
   }
@@ -496,9 +498,7 @@ bool isVOPC64DPP(unsigned Opc) {
   return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc);
 }
 
-bool isVOPCAsmOnly(unsigned Opc) {
-  return isVOPCAsmOnlyOpcodeHelper(Opc) || isVOP3CAsmOnlyOpcodeHelper(Opc);
-}
+bool isVOPCAsmOnly(unsigned Opc) { return isVOPCAsmOnlyOpcodeHelper(Opc); }
 
 bool getMAIIsDGEMM(unsigned Opc) {
   const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index f136a43..c001c5d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -503,6 +503,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
                        dpp8:$dpp8, Dpp8FI:$fi);
   let Src2Mod = FP32InputMods; // dummy unused modifiers
   let Src2RC64 = VGPRSrc_32;   // stub argument
+  let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
 }
 def VOP_MAC_F32 : VOP_MAC <f32>;
 let HasExtDPP = 0, HasExt32BitDPP = 0 in
@@ -618,7 +619,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
   let AsmVOP3Base = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
 
   let Outs32 = (outs DstRC:$vdst);
-  let Outs64 = (outs DstRC:$vdst);
+  let Outs64 = (outs DstRC64:$vdst);
 
   // Suppress src2 implied by type since the 32-bit encoding uses an
   // implicit VCC use.
@@ -652,7 +653,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
                    dpp8:$dpp8, Dpp8FI:$fi);
 
   let Src0ModVOP3DPP = FPVRegInputMods;
-  let Src1ModVOP3DPP = FPVRegInputMods;
+  let Src1ModVOP3DPP = FP32VCSrcInputMods;
 
   let HasExt = 1;
   let HasExtDPP = 1;
@@ -662,7 +663,17 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
 }
 
 def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
-def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
+def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
+  let IsTrue16 = 1;
+  let DstRC64 = getVALUDstForVT<DstVT>.ret;
+
+  let Src0Mod = getSrcMod<f16>.ret;
+  let Src1Mod = getSrcMod<f16>.ret;
+
+  let Src0VOP3DPP = VGPRSrc_32;
+  let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
+  let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 1/*IsFake16*/>.ret;
+}
 
 def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
   let Outs32 = (outs SReg_32:$vdst);
@@ -703,7 +714,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
 //===----------------------------------------------------------------------===//
 
 let SubtargetPredicate = isGFX11Plus in
-defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>;
+defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>;
 defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
 let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
 def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 022fb7c..0b3a3d5 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -772,7 +772,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
   // DPP8 forbids modifiers and can inherit from VOPC_Profile
 
   let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
-  dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VRegSrc_32:$src1);
+  dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
   let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), (ins)));
   let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -1377,31 +1377,9 @@ multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
     }
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
-      defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
       def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
                                 SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
-      def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
-        let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave32;
-      }
-      def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
-        let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave64;
-      }
-      defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
       def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
-      def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
-        let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave32;
-      }
-      def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
-        let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave64;
-      }
     }
   } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
@@ -1472,35 +1450,9 @@ multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
 
     if ps64.Pfl.HasExtVOP3DPP then {
       defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
-      defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
       def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
                                 SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
-      def _e64_dpp_w32#Gen.Suffix
-          : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
-        let AsmString = asm_name # " vcc_lo, " # AsmDPP;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave32;
-      }
-      def _e64_dpp_w64#Gen.Suffix
-          : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
-        let AsmString = asm_name # " vcc, " # AsmDPP;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave64;
-      }
-      defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
       def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
-      def _e64_dpp8_w32#Gen.Suffix
-          : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
-        let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave32;
-      }
-      def _e64_dpp8_w64#Gen.Suffix
-          : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
-        let AsmString = asm_name # " vcc, " # AsmDPP8;
-        let isAsmParserOnly = 1;
-        let WaveSizePredicate = isWave64;
-      }
     }
   } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
 }
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a6272e9..60e91c7 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1680,7 +1680,6 @@ class AsmOnlyInfoTable <string Format, string Class>: GenericTable {
 }
 
 def VOPCAsmOnlyInfoTable : AsmOnlyInfoTable <"VOPC", "VOPC_DPPe_Common">;
-def VOP3CAsmOnlyInfoTable : AsmOnlyInfoTable <"VOP3C", "VOP3_DPPe_Common_Base">;
 
 def VOPTrue16Table : GenericTable {
   let FilterClass = "VOP_Pseudo";
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 9c29acb..bef7607 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -153,15 +153,15 @@ class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
 
 class LWPC_ENC : PCREL19_FM<OPCODE2_LWPC>;
 
-class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
-class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MAX_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
 class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
 class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
 
 class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
 class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
-class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
-class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
 
 class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
 class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 7f35107..38c1f9868 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -139,20 +139,21 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .clampScalar(0, s32, sXLen)
       .minScalarSameAs(1, 0);
 
+  auto &ExtActions =
+      getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+          .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+                       typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
   if (ST.is64Bit()) {
-    getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
-        .legalFor({{sXLen, s32}})
-        .maxScalar(0, sXLen);
-
+    ExtActions.legalFor({{sXLen, s32}});
     getActionDefinitionsBuilder(G_SEXT_INREG)
         .customFor({sXLen})
         .maxScalar(0, sXLen)
         .lower();
   } else {
-    getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}).maxScalar(0, sXLen);
-
     getActionDefinitionsBuilder(G_SEXT_INREG).maxScalar(0, sXLen).lower();
   }
+  ExtActions.customIf(typeIsLegalBoolVec(1, BoolVecTys, ST))
+      .maxScalar(0, sXLen);
 
   // Merge/Unmerge
   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
@@ -235,7 +236,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   getActionDefinitionsBuilder(G_ICMP)
       .legalFor({{sXLen, sXLen}, {sXLen, p0}})
-      .widenScalarToNextPow2(1)
+      .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+                   typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)))
+      .widenScalarOrEltToNextPow2OrMinSize(1, 8)
      .clampScalar(1, sXLen, sXLen)
       .clampScalar(0, sXLen, sXLen);
 
@@ -418,6 +421,29 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .clampScalar(0, sXLen, sXLen)
       .customFor({sXLen});
 
+  auto &SplatActions =
+      getActionDefinitionsBuilder(G_SPLAT_VECTOR)
+          .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+                       typeIs(1, sXLen)))
+          .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), typeIs(1, s1)));
+  // Handle case of s64 element vectors on RV32. If the subtarget does not have
+  // f64, then try to lower it to G_SPLAT_VECTOR_SPLIT_64_VL. If the subtarget
+  // does have f64, then we don't know whether the type is an f64 or an i64,
+  // so mark the G_SPLAT_VECTOR as legal and decide later what to do with it,
+  // depending on how the instructions it consumes are legalized. They are not
+  // legalized yet since legalization is in reverse postorder, so we cannot
+  // make the decision at this moment.
+  if (XLen == 32) {
+    if (ST.hasVInstructionsF64() && ST.hasStdExtD())
+      SplatActions.legalIf(all(
+          typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+    else if (ST.hasVInstructionsI64())
+      SplatActions.customIf(all(
+          typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+  }
+
+  SplatActions.clampScalar(1, sXLen, sXLen);
+
   getLegacyLegalizerInfo().computeTables();
 }
 
@@ -576,7 +602,145 @@ bool RISCVLegalizerInfo::legalizeVScale(MachineInstr &MI,
     auto VScale = MIB.buildLShr(XLenTy, VLENB, MIB.buildConstant(XLenTy, 3));
     MIB.buildMul(Dst, VScale, MIB.buildConstant(XLenTy, Val));
   }
+  MI.eraseFromParent();
+  return true;
+}
+
+// Custom-lower extensions from mask vectors by using a vselect either with 1
+// for zero/any-extension or -1 for sign-extension:
+//   (vXiN = (s|z)ext vXi1:vmask) -> (vXiN = vselect vmask, (-1 or 1), 0)
+// Note that any-extension is lowered identically to zero-extension.
+bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
+                                     MachineIRBuilder &MIB) const {
+
+  unsigned Opc = MI.getOpcode();
+  assert(Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_SEXT ||
+         Opc == TargetOpcode::G_ANYEXT);
+
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  int64_t ExtTrueVal = Opc == TargetOpcode::G_SEXT ? -1 : 1;
+  LLT DstEltTy = DstTy.getElementType();
+  auto SplatZero = MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, 0));
+  auto SplatTrue =
+      MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, ExtTrueVal));
+  MIB.buildSelect(Dst, Src, SplatTrue, SplatZero);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Return the type of the mask type suitable for masking the provided
+/// vector type. This is simply an i1 element type vector of the same
+/// (possibly scalable) length.
+static LLT getMaskTypeFor(LLT VecTy) {
+  assert(VecTy.isVector());
+  ElementCount EC = VecTy.getElementCount();
+  return LLT::vector(EC, LLT::scalar(1));
+}
+
+/// Creates an all ones mask suitable for masking a vector of type VecTy with
+/// vector length VL.
+static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
+                                            MachineIRBuilder &MIB,
+                                            MachineRegisterInfo &MRI) {
+  LLT MaskTy = getMaskTypeFor(VecTy);
+  return MIB.buildInstr(RISCV::G_VMSET_VL, {MaskTy}, {VL});
+}
+
+/// Gets the two common "VL" operands: an all-ones mask and the vector length.
+/// VecTy is a scalable vector type.
+static std::pair<MachineInstrBuilder, Register>
+buildDefaultVLOps(const DstOp &Dst, MachineIRBuilder &MIB,
+                  MachineRegisterInfo &MRI) {
+  LLT VecTy = Dst.getLLTTy(MRI);
+  assert(VecTy.isScalableVector() && "Expecting scalable container type");
+  Register VL(RISCV::X0);
+  MachineInstrBuilder Mask = buildAllOnesMask(VecTy, VL, MIB, MRI);
+  return {Mask, VL};
+}
+
+static MachineInstrBuilder
+buildSplatPartsS64WithVL(const DstOp &Dst, const SrcOp &Passthru, Register Lo,
+                         Register Hi, Register VL, MachineIRBuilder &MIB,
+                         MachineRegisterInfo &MRI) {
+  // TODO: If the Hi bits of the splat are undefined, then it's fine to just
+  // splat Lo even if it might be sign extended. I don't think we have
+  // introduced a case where we're build a s64 where the upper bits are undef
+  // yet.
+
+  // Fall back to a stack store and stride x0 vector load.
+  // TODO: need to lower G_SPLAT_VECTOR_SPLIT_I64. This is done in
+  // preprocessDAG in SDAG.
+  return MIB.buildInstr(RISCV::G_SPLAT_VECTOR_SPLIT_I64_VL, {Dst},
+                        {Passthru, Lo, Hi, VL});
+}
+
+static MachineInstrBuilder
+buildSplatSplitS64WithVL(const DstOp &Dst, const SrcOp &Passthru,
+                         const SrcOp &Scalar, Register VL,
+                         MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
+  assert(Scalar.getLLTTy(MRI) == LLT::scalar(64) && "Unexpected VecTy!");
+  auto Unmerge = MIB.buildUnmerge(LLT::scalar(32), Scalar);
+  return buildSplatPartsS64WithVL(Dst, Passthru, Unmerge.getReg(0),
+                                  Unmerge.getReg(1), VL, MIB, MRI);
+}
+
+// Lower splats of s1 types to G_ICMP. For each mask vector type, we have a
+// legal equivalently-sized i8 type, so we can use that as a go-between.
+// Splats of s1 types that have constant value can be legalized as VMSET_VL or
+// VMCLR_VL.
+bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
+                                             MachineIRBuilder &MIB) const {
+  assert(MI.getOpcode() == TargetOpcode::G_SPLAT_VECTOR);
+
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register SplatVal = MI.getOperand(1).getReg();
+
+  LLT VecTy = MRI.getType(Dst);
+  LLT XLenTy(STI.getXLenVT());
+
+  // Handle case of s64 element vectors on rv32
+  if (XLenTy.getSizeInBits() == 32 &&
+      VecTy.getElementType().getSizeInBits() == 64) {
+    auto [_, VL] = buildDefaultVLOps(Dst, MIB, MRI);
+    buildSplatSplitS64WithVL(Dst, MIB.buildUndef(VecTy), SplatVal, VL, MIB,
+                             MRI);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // All-zeros or all-ones splats are handled specially.
+  MachineInstr &SplatValMI = *MRI.getVRegDef(SplatVal);
+  if (isAllOnesOrAllOnesSplat(SplatValMI, MRI)) {
+    auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+    MIB.buildInstr(RISCV::G_VMSET_VL, {Dst}, {VL});
+    MI.eraseFromParent();
+    return true;
+  }
+  if (isNullOrNullSplat(SplatValMI, MRI)) {
+    auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+    MIB.buildInstr(RISCV::G_VMCLR_VL, {Dst}, {VL});
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Handle non-constant mask splat (i.e. not sure if it's all zeros or all
+  // ones) by promoting it to an s8 splat.
+  LLT InterEltTy = LLT::scalar(8);
+  LLT InterTy = VecTy.changeElementType(InterEltTy);
+  auto ZExtSplatVal = MIB.buildZExt(InterEltTy, SplatVal);
+  auto And =
+      MIB.buildAnd(InterEltTy, ZExtSplatVal, MIB.buildConstant(InterEltTy, 1));
+  auto LHS = MIB.buildSplatVector(InterTy, And);
+  auto ZeroSplat =
+      MIB.buildSplatVector(InterTy, MIB.buildConstant(InterEltTy, 0));
+  MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, LHS, ZeroSplat);
   MI.eraseFromParent();
   return true;
 }
@@ -640,6 +804,12 @@ bool RISCVLegalizerInfo::legalizeCustom(
     return legalizeVAStart(MI, MIRBuilder);
   case TargetOpcode::G_VSCALE:
     return legalizeVScale(MI, MIRBuilder);
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ANYEXT:
+    return legalizeExt(MI, MIRBuilder);
+  case TargetOpcode::G_SPLAT_VECTOR:
+    return legalizeSplatVector(MI, MIRBuilder);
   }
 
   llvm_unreachable("expected switch to return");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index e2a98c8..5bb1e7a 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -43,6 +43,8 @@ private:
   bool legalizeVAStart(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
   bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
+  bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
+  bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
 };
 } // end namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 888bcc4..86e4434 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -290,16 +290,7 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 
   switch (Opc) {
   case TargetOpcode::G_ADD:
-  case TargetOpcode::G_SUB: {
-    if (MRI.getType(MI.getOperand(0).getReg()).isVector()) {
-      LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-      return getInstructionMapping(
-          DefaultMappingID, /*Cost=*/1,
-          getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue()),
-          NumOperands);
-    }
-  }
-    LLVM_FALLTHROUGH;
+  case TargetOpcode::G_SUB:
case TargetOpcode::G_SHL: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: @@ -320,14 +311,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_PTRTOINT: case TargetOpcode::G_INTTOPTR: - case TargetOpcode::G_TRUNC: - case TargetOpcode::G_ANYEXT: - case TargetOpcode::G_SEXT: - case TargetOpcode::G_ZEXT: - case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: - return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping, - NumOperands); case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: @@ -338,25 +321,48 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - return getInstructionMapping(DefaultMappingID, /*Cost=*/1, - getFPValueMapping(Ty.getSizeInBits()), - NumOperands); + TypeSize Size = Ty.getSizeInBits(); + + const ValueMapping *Mapping; + if (Ty.isVector()) + Mapping = getVRBValueMapping(Size.getKnownMinValue()); + else if (isPreISelGenericFloatingPointOpcode(Opc)) + Mapping = getFPValueMapping(Size.getFixedValue()); + else + Mapping = GPRValueMapping; + +#ifndef NDEBUG + // Make sure all the operands are using similar size and type. + for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { + LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); + assert(Ty.isVector() == OpTy.isVector() && + "Operand has incompatible type"); + // Don't check size for GPR. + if (OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc)) + assert(Size == OpTy.getSizeInBits() && "Operand has incompatible size"); + } +#endif // End NDEBUG + + return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands); } + case TargetOpcode::G_SEXTLOAD: + case TargetOpcode::G_ZEXTLOAD: + return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping, + NumOperands); case TargetOpcode::G_IMPLICIT_DEF: { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); - uint64_t DstMinSize = DstTy.getSizeInBits().getKnownMinValue(); + unsigned DstMinSize = DstTy.getSizeInBits().getKnownMinValue(); auto Mapping = GPRValueMapping; // FIXME: May need to do a better job determining when to use FPRB. 
// For example, the look through COPY case: // %0:_(s32) = G_IMPLICIT_DEF // %1:_(s32) = COPY %0 // $f10_d = COPY %1(s32) - if (anyUseOnlyUseFP(Dst, MRI, TRI)) - Mapping = getFPValueMapping(DstMinSize); - if (DstTy.isVector()) Mapping = getVRBValueMapping(DstMinSize); + else if (anyUseOnlyUseFP(Dst, MRI, TRI)) + Mapping = getFPValueMapping(DstMinSize); return getInstructionMapping(DefaultMappingID, /*Cost=*/1, Mapping, NumOperands); @@ -529,7 +535,10 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (!Ty.isValid()) continue; - if (isPreISelGenericFloatingPointOpcode(Opc)) + if (Ty.isVector()) + OpdsMapping[Idx] = + getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue()); + else if (isPreISelGenericFloatingPointOpcode(Opc)) OpdsMapping[Idx] = getFPValueMapping(Ty.getSizeInBits()); else OpdsMapping[Idx] = GPRValueMapping; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 55ba494..f99dc0b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3287,24 +3287,24 @@ bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits, } bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) { - // Truncates are custom lowered during legalization. - auto IsTrunc = [this](SDValue N) { - if (N->getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) + auto IsExtOrTrunc = [](SDValue N) { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + // There's no passthru on these _VL nodes so any VL/mask is ok, since any + // inactive elements will be undef. + case RISCVISD::TRUNCATE_VECTOR_VL: + case RISCVISD::VSEXT_VL: + case RISCVISD::VZEXT_VL: + return true; + default: return false; - SDValue VL; - selectVLOp(N->getOperand(2), VL); - // Any vmset_vl is ok, since any bits past VL are undefined and we can - // assume they are set. - return N->getOperand(1).getOpcode() == RISCVISD::VMSET_VL && - isa<ConstantSDNode>(VL) && - cast<ConstantSDNode>(VL)->getSExtValue() == RISCV::VLMaxSentinel; + } }; - // We can have multiple nested truncates, so unravel them all if needed. - while (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || IsTrunc(N)) { - if (!N.hasOneUse() || - N.getValueType().getSizeInBits().getKnownMinValue() < 8) + // We can have multiple nested nodes, so unravel them all if needed. 
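+  // For example (an illustrative chain), a splat source such as
+  //   zext (trunc_vector_vl (vsext_vl X))
+  // peels back to X here, provided every intermediate value has a single use
+  // and keeps at least 8 bits per element.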
+ while (IsExtOrTrunc(N)) { + if (!N.hasOneUse() || N.getScalarValueSizeInBits() < 8) return false; N = N->getOperand(0); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ee83f9d..279d8a4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21115,12 +21115,10 @@ void RVVArgDispatcher::constructArgInfos(ArrayRef<Type *> TypeList) { RegisterVT.getVectorElementType() == MVT::i1) { RVVArgInfos.push_back({1, RegisterVT, true}); FirstVMaskAssigned = true; - } else { - RVVArgInfos.push_back({1, RegisterVT, false}); + --NumRegs; } - RVVArgInfos.insert(RVVArgInfos.end(), --NumRegs, - {1, RegisterVT, false}); + RVVArgInfos.insert(RVVArgInfos.end(), NumRegs, {1, RegisterVT, false}); } } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td index 54e22d6..ba40662 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td +++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td @@ -32,3 +32,28 @@ def G_READ_VLENB : RISCVGenericInstruction { let hasSideEffects = false; } def : GINodeEquiv<G_READ_VLENB, riscv_read_vlenb>; + +// Pseudo equivalent to a RISCVISD::VMCLR_VL +def G_VMCLR_VL : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$vl); + let hasSideEffects = false; +} +def : GINodeEquiv<G_VMCLR_VL, riscv_vmclr_vl>; + +// Pseudo equivalent to a RISCVISD::VMSET_VL +def G_VMSET_VL : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$vl); + let hasSideEffects = false; +} +def : GINodeEquiv<G_VMSET_VL, riscv_vmset_vl>; + +// Pseudo equivalent to a RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL. There is no +// record to mark as equivalent to using GINodeEquiv because it gets lowered +// before instruction selection. 
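+// As an illustration (rv32 scenario assumed): legalizeSplatVector builds this
+// node for s64-element splats from the two unmerged s32 halves of the scalar;
+// it is intended to be expanded later into a stack store plus a stride-x0
+// vector load (see the TODO in buildSplatPartsS64WithVL in
+// RISCVLegalizerInfo.cpp).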
+def G_SPLAT_VECTOR_SPLIT_I64_VL : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$passthru, type1:$hi, type1:$lo, type2:$vl); + let hasSideEffects = false; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index cc44092..73d52d5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -387,6 +387,9 @@ def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVT<3, XLenVT>]>; def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>; def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>; +def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C), + [(riscv_sext_vl node:$A, node:$B, node:$C), + (riscv_zext_vl node:$A, node:$B, node:$C)]>; def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL", SDTypeProfile<1, 3, [SDTCisVec<0>, @@ -535,6 +538,11 @@ def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), return N->hasOneUse(); }]>; +def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_ext_vl node:$A, node:$B, node:$C), [{ + return N->hasOneUse(); +}]>; + def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ return N->hasOneUse(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index 51a7a0a1..c1facc79 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -630,6 +630,19 @@ foreach vtiToWti = AllWidenableIntVectors in { (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (riscv_ext_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask V0), VLOpFrag)), + (wti.Vector wti.RegClass:$merge), + (vti.Mask V0), VLOpFrag), + (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK") + wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + + def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), (wti.Vector wti.RegClass:$merge), @@ -639,6 +652,17 @@ foreach vtiToWti = AllWidenableIntVectors in { (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), + (wti.Vector wti.RegClass:$merge), + (vti.Mask V0), VLOpFrag), + (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK") + wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + + def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), (wti.Vector wti.RegClass:$merge), @@ -647,6 +671,17 @@ foreach vtiToWti = AllWidenableIntVectors in { wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), + (wti.Vector wti.RegClass:$merge), + (vti.Mask V0), VLOpFrag), + (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK") + 
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_vwsll_vl (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index ba108912..85f8f5f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -254,6 +254,7 @@ public: const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; + bool isTargetAndroid() const { return getTargetTriple().isAndroid(); } bool isTargetFuchsia() const { return getTargetTriple().isOSFuchsia(); } bool useConstantPoolForLargeInts() const; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 38304ff..aeec063 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -245,6 +245,10 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, return TTI::TCC_Free; } +bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const { + return ST->hasVInstructions(); +} + TargetTransformInfo::PopcntSupportKind RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); @@ -861,9 +865,14 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } // TODO: add more intrinsic case Intrinsic::experimental_stepvector: { - unsigned Cost = 1; // vid auto LT = getTypeLegalizationCost(RetTy); - return Cost + (LT.first - 1); + // Legalisation of illegal types involves an `index' instruction plus + // (LT.first - 1) vector adds. + if (ST->hasVInstructions()) + return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) + + (LT.first - 1) * + getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind); + return 1 + (LT.first - 1); } case Intrinsic::vp_rint: { // RISC-V target uses at least 5 instructions to lower rounding intrinsics. diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index ac32aea..c0169ea 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -78,6 +78,22 @@ public: const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); + /// \name EVL Support for predicated vectorization. + /// Whether the target supports the %evl parameter of VP intrinsic efficiently + /// in hardware, for the given opcode and type/alignment. (see LLVM Language + /// Reference - "Vector Predication Intrinsics", + /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and + /// "IR-level VP intrinsics", + /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics). + /// \param Opcode the opcode of the instruction checked for predicated version + /// support. + /// \param DataType the type of the instruction with the \p Opcode checked for + /// prediction support. + /// \param Alignment the alignment for memory access operation checked for + /// predicated version support. 
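+  /// For example (an illustrative use, not prescribed by this patch), a
+  /// predicated vectorizer may query this hook before emitting @llvm.vp.*
+  /// calls whose %evl operand is recomputed each iteration; the RISC-V
+  /// implementation simply reports whether the subtarget has V instructions.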
+ bool hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const; + TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth); bool shouldExpandReduction(const IntrinsicInst *II) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 1674cef..9e4ba21 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -243,8 +243,12 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, continue; MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1)); - SPIRVType *ElementType = GR->getOrCreateSPIRVType( - cast<ConstantAsMetadata>(VMD->getMetadata())->getType(), MIRBuilder); + Type *ElementTy = cast<ConstantAsMetadata>(VMD->getMetadata())->getType(); + if (isUntypedPointerTy(ElementTy)) + ElementTy = + TypedPointerType::get(IntegerType::getInt8Ty(II->getContext()), + getPointerAddressSpace(ElementTy)); + SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder); return GR->getOrCreateSPIRVPointerType( ElementType, MIRBuilder, addressSpaceToStorageClass( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index e0099e5..ac79937 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -47,7 +47,7 @@ class SPIRVGlobalRegistry { DenseMap<const MachineOperand *, const Function *> InstrToFunction; // Maps Functions to their calls (in a form of the machine instruction, // OpFunctionCall) that happened before the definition is available - DenseMap<const Function *, SmallVector<MachineInstr *>> ForwardCalls; + DenseMap<const Function *, SmallPtrSet<MachineInstr *, 8>> ForwardCalls; // Look for an equivalent of the newType in the map. Return the equivalent // if it's found, otherwise insert newType to the map and return the type. @@ -215,12 +215,12 @@ public: if (It == ForwardCalls.end()) ForwardCalls[F] = {MI}; else - It->second.push_back(MI); + It->second.insert(MI); } // Map a Function to the vector of machine instructions that represents // forward function calls or to nullptr if not found. - SmallVector<MachineInstr *> *getForwardCalls(const Function *F) { + SmallPtrSet<MachineInstr *, 8> *getForwardCalls(const Function *F) { auto It = ForwardCalls.find(F); return It == ForwardCalls.end() ? 
nullptr : &It->second; } diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 90a3155..d450078 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -193,7 +193,7 @@ void validateForwardCalls(const SPIRVSubtarget &STI, MachineRegisterInfo *DefMRI, SPIRVGlobalRegistry &GR, MachineInstr &FunDef) { const Function *F = GR.getFunctionByDefinition(&FunDef); - if (SmallVector<MachineInstr *> *FwdCalls = GR.getForwardCalls(F)) + if (SmallPtrSet<MachineInstr *, 8> *FwdCalls = GR.getForwardCalls(F)) for (MachineInstr *FunCall : *FwdCalls) { MachineRegisterInfo *CallMRI = &FunCall->getParent()->getParent()->getRegInfo(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index f4525e71..49749b5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1825,7 +1825,24 @@ bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg, bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { - return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable)) + // Change order of instructions if needed: all OpVariable instructions in a + // function must be the first instructions in the first block + MachineFunction *MF = I.getParent()->getParent(); + MachineBasicBlock *MBB = &MF->front(); + auto It = MBB->SkipPHIsAndLabels(MBB->begin()), E = MBB->end(); + bool IsHeader = false; + unsigned Opcode; + for (; It != E && It != I; ++It) { + Opcode = It->getOpcode(); + if (Opcode == SPIRV::OpFunction || Opcode == SPIRV::OpFunctionParameter) { + IsHeader = true; + } else if (IsHeader && + !(Opcode == SPIRV::ASSIGN_TYPE || Opcode == SPIRV::OpLabel)) { + ++It; + break; + } + } + return BuildMI(*MBB, It, It->getDebugLoc(), TII.get(SPIRV::OpVariable)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function)) diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 215a8ea..6855471 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, default: // See if this is a generic print operand return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); + case 'L': // Low order register of a twin word register operand + case 'H': // High order register of a twin word register operand + { + const SparcSubtarget &Subtarget = MF->getSubtarget<SparcSubtarget>(); + const MachineOperand &MO = MI->getOperand(OpNo); + const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo(); + Register MOReg = MO.getReg(); + + Register HiReg, LoReg; + if (!SP::IntPairRegClass.contains(MOReg)) { + // If we aren't given a register pair already, find out which pair it + // belongs to. Note that here, the specified register operand, which + // refers to the high part of the twinword, needs to be an even-numbered + // register. 
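+      // For example (hypothetical inline asm), with a 64-bit value bound to a
+      // register pair:
+      //   asm("std %H1, [%0]" : : "r"(Addr), "r"(Val64));
+      // Here %H1 prints the even (high) sub-register of the pair, and %L1
+      // would print the odd (low) one.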
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even, + &SP::IntPairRegClass); + if (!MOReg) { + SMLoc Loc; + OutContext.reportError( + Loc, "Hi part of pair should point to an even-numbered register"); + OutContext.reportError( + Loc, "(note that in some cases it might be necessary to manually " + "bind the input/output registers instead of relying on " + "automatic allocation)"); + return true; + } + } + + HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even); + LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd); + + Register Reg; + switch (ExtraCode[0]) { + case 'L': + Reg = LoReg; + break; + case 'H': + Reg = HiReg; + break; + } + + O << '%' << SparcInstPrinter::getRegisterName(Reg); + return false; + } case 'f': case 'r': break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a9751e1..6f65344 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( switch (Op.getOpcode()) { case X86ISD::PSHUFD: case X86ISD::VPERMILPI: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: return false; } return TargetLowering::canCreateUndefOrPoisonForTargetNode( diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ce3b6af..270dd32 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -2161,6 +2161,11 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> { def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>; def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>; def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>; + + def : Pat<(or_is_add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>; + def : Pat<(or_is_add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>; + def : Pat<(or_is_add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>; + def : Pat<(or_is_add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>; } } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index f243343..a5b2e48 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -6276,10 +6276,10 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, case X86::RCPSSm: case X86::RCPSSr_Int: case X86::RCPSSm_Int: - case X86::ROUNDSDr: - case X86::ROUNDSDm: - case X86::ROUNDSSr: - case X86::ROUNDSSm: + case X86::ROUNDSDri: + case X86::ROUNDSDmi: + case X86::ROUNDSSri: + case X86::ROUNDSSmi: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: @@ -6778,14 +6778,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, case X86::VRCPSSr_Int: case X86::VRCPSSm: case X86::VRCPSSm_Int: - case X86::VROUNDSDr: - case X86::VROUNDSDm: - case X86::VROUNDSDr_Int: - case X86::VROUNDSDm_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSm: - case X86::VROUNDSSr_Int: - case X86::VROUNDSSm_Int: + case X86::VROUNDSDri: + case X86::VROUNDSDmi: + case X86::VROUNDSDri_Int: + case X86::VROUNDSDmi_Int: + case X86::VROUNDSSri: + case X86::VROUNDSSmi: + case X86::VROUNDSSri_Int: + case X86::VROUNDSSmi_Int: case X86::VRSQRTSSr: case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: @@ -7516,8 +7516,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VRCPSSr_Int: case 
X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int: - case X86::ROUNDSSr_Int: - case X86::VROUNDSSr_Int: + case X86::ROUNDSSri_Int: + case X86::VROUNDSSri_Int: case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int: @@ -7685,8 +7685,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VCVTSD2USI64Zrr_Int: case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int: - case X86::ROUNDSDr_Int: - case X86::VROUNDSDr_Int: + case X86::ROUNDSDri_Int: + case X86::VROUNDSDri_Int: case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 69d4536..2b391b6 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5475,35 +5475,35 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, // Intrinsic operation, reg. // Vector intrinsic operation, reg let Uses = [MXCSR], mayRaiseFPException = 1 in { - def r : SS4AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, - Sched<[sched]>; + def ri : SS4AIi8<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, + Sched<[sched]>; // Vector intrinsic operation, mem - def m : SS4AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, - Sched<[sched.Folded]>; + def mi : SS4AIi8<opc, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, + Sched<[sched.Folded]>; } } multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SSr : SS4AIi8<opcss, MRMSrcReg, + def SSri : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[sched]>; let mayLoad = 1 in - def SSm : SS4AIi8<opcss, MRMSrcMem, + def SSmi : SS4AIi8<opcss, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -5511,14 +5511,14 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SDr : SS4AIi8<opcsd, MRMSrcReg, + def SDri : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[sched]>; let mayLoad = 1 in - def SDm : SS4AIi8<opcsd, MRMSrcMem, + def SDmi : SS4AIi8<opcsd, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -5530,44 +5530,44 @@ multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { let Uses = [MXCSR], 
mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SSr : SS4AIi8<opcss, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched]>; + def SSri : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched]>; let mayLoad = 1 in - def SSm : SS4AIi8<opcss, MRMSrcMem, - (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched.Folded, sched.ReadAfterFold]>; + def SSmi : SS4AIi8<opcss, MRMSrcMem, + (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SDr : SS4AIi8<opcsd, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched]>; + def SDri : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched]>; let mayLoad = 1 in - def SDm : SS4AIi8<opcsd, MRMSrcMem, - (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched.Folded, sched.ReadAfterFold]>; + def SDmi : SS4AIi8<opcsd, MRMSrcMem, + (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } } -multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, X86FoldableSchedWrite sched, - ValueType VT32, ValueType VT64, - SDNode OpNode, bit Is2Addr = 1> { +multiclass sse41_fp_unop_s_int<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, X86FoldableSchedWrite sched, + ValueType VT32, ValueType VT64, + SDNode OpNode, bit Is2Addr = 1> { let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle in { - def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + def SSri_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5577,7 +5577,7 @@ let ExeDomain = SSEPackedSingle in { [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; - def SSm_Int : SS4AIi8<opcss, MRMSrcMem, + def SSmi_Int : SS4AIi8<opcss, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5590,7 +5590,7 @@ let ExeDomain = SSEPackedSingle in { } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 let ExeDomain = SSEPackedDouble in { - def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + def SDri_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5600,7 +5600,7 @@ let ExeDomain = SSEPackedDouble in { [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; - def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, + def SDmi_Int : SS4AIi8<opcsd, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, 
i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5636,25 +5636,25 @@ let Predicates = [HasAVX, NoVLX] in { } } let Predicates = [UseAVX] in { - defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, - v4f32, v2f64, X86RndScales, 0>, - VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; + defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales, 0>, + VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; } let Predicates = [UseAVX] in { def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; + (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; + (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -5667,21 +5667,21 @@ defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, - v4f32, v2f64, X86RndScales>; +defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), - (ROUNDSSr FR32:$src1, timm:$src2)>; + (ROUNDSSri FR32:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), - (ROUNDSDr FR64:$src1, timm:$src2)>; + (ROUNDSDri FR64:$src1, timm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), - (ROUNDSSm addr:$src1, timm:$src2)>; + (ROUNDSSmi addr:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), - (ROUNDSDm addr:$src1, timm:$src2)>; + (ROUNDSDmi addr:$src1, timm:$src2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 0027de8..63ac910 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -324,14 +324,14 @@ defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM). defm : X86WriteResPairUnsupported<WriteFMAZ>; defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product. -defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product. -defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM). 
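+// Splitting the register and folded forms lets the load variants below carry
+// their own latency and the extra load-pipe micro-ops, rather than deriving
+// them from the register form via the WriteResPair helper.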
+defm : X86WriteRes<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>; +defm : X86WriteRes<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>; +defm : X86WriteRes<WriteDPPSLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 19, [2,1,1,1,1], 6>; +defm : X86WriteRes<WriteDPPSYLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 20, [2,1,1,1,1], 6>; defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs. -defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding. -defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM). +defm : BWWriteResPair<WriteFRnd, [BWPort1], 6, [2], 2, 5>; // Floating point rounding. +defm : BWWriteResPair<WriteFRndY, [BWPort1], 6, [2], 2, 6>; // Floating point rounding (YMM/ZMM). defm : X86WriteResPairUnsupported<WriteFRndZ>; -defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>; -defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>; defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals. defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM). defm : X86WriteResPairUnsupported<WriteFLogicZ>; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index a11b470..516dc62 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -324,15 +324,14 @@ defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>; defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>; defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1 defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>; -defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>; -defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; +defm : X86WriteRes<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>; +defm : X86WriteRes<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>; +defm : X86WriteRes<WriteDPPSLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 20, [2,1,1,1,1], 6>; +defm : X86WriteRes<WriteDPPSYLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 21, [2,1,1,1,1], 6>; defm : HWWriteResPair<WriteFSign, [HWPort0], 1>; -defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>; -defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>; -defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1 -defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>; -defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>; -defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1 +defm : HWWriteResPair<WriteFRnd, [HWPort1], 6, [2], 2, 6>; +defm : HWWriteResPair<WriteFRndY, [HWPort1], 6, [2], 2, 7>; +defm : HWWriteResPair<WriteFRndZ, [HWPort1], 6, [2], 2, 7>; // Unsupported = 1 defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>; defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>; defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td index 88bb9ad..ff3fe32 100644 --- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td +++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td @@ -2290,8 +2290,8 @@ def SPRWriteResGroup218 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> { let Latency = 15; let NumMicroOps = 3; } -def : InstRW<[SPRWriteResGroup218], (instregex 
"^(V?)ROUNDP(D|S)m$")>; -def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)m((_Int)?)$", +def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)mi$")>; +def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)mi((_Int)?)$", "^VRNDSCALEP(D|S)Z128rm(bi|ik)$", "^VRNDSCALEP(D|S)Z128rmbik(z?)$", "^VRNDSCALEP(D|S)Z128rmi((kz)?)$", @@ -2303,13 +2303,13 @@ def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> { let Latency = 8; let NumMicroOps = 2; } -def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)r$", - "^(V?)ROUND(PS|SD)r$", - "^(V?)ROUNDS(D|S)r_Int$", +def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$", + "^(V?)ROUND(PS|SD)ri$", + "^(V?)ROUNDS(D|S)ri_Int$", "^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$", "^VRNDSCALES(D|S)Zr$", "^VRNDSCALES(D|S)Zr(b?)_Int((k|kz)?)$", - "^VROUNDP(D|S)Yr$")>; + "^VROUNDP(D|S)Yri$")>; def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> { let ReleaseAtCycles = [2]; @@ -3737,7 +3737,7 @@ def SPRWriteResGroup390 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> { let NumMicroOps = 3; } def : InstRW<[SPRWriteResGroup390], (instregex "^VF(C?)MADDCPHZ(128|256)m(b?)$", - "^VROUNDP(D|S)Ym$")>; + "^VROUNDP(D|S)Ymi$")>; def : InstRW<[SPRWriteResGroup390, ReadAfterVecXLd], (instregex "^VF(C?)MADDCSHZm$", "^VF(C?)MULCPHZ128rm(b?)$", "^VF(C?)MULCSHZrm$", diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 4fa138f..3ee931f 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -311,8 +311,10 @@ defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>; defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>; defm : X86WriteResPairUnsupported<WriteFMAZ>; defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product. -defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>; -defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>; +defm : X86WriteRes<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4>; +defm : X86WriteRes<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4>; +defm : X86WriteRes<WriteDPPSLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 19, [1,3,1,1], 6>; +defm : X86WriteRes<WriteDPPSYLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 20, [1,3,1,1], 6>; defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs. defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding. defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 3da688c..a7dff0e 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -311,8 +311,10 @@ defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>; defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>; defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>; defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product. 
-defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>; -defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>; +defm : X86WriteRes<WriteDPPS, [SKXPort5,SKXPort01], 13, [1,3], 4>; +defm : X86WriteRes<WriteDPPSY, [SKXPort5,SKXPort01], 13, [1,3], 4>; +defm : X86WriteRes<WriteDPPSLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 19, [1,3,1,1], 6>; +defm : X86WriteRes<WriteDPPSYLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 20, [1,3,1,1], 6>; defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs. defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding. defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index d90c8bd..2e87d52 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -52,7 +52,7 @@ def Znver3Model : SchedMachineModel { int VecLoadLatency = 7; // Latency of a simple store operation. int StoreLatency = 1; - // FIXME + // FIXME: let HighLatency = 25; // FIXME: any better choice? // AMD SOG 19h, 2.8 Optimizing Branching // The branch misprediction penalty is in the range from 11 to 18 cycles, @@ -193,11 +193,11 @@ def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0 // <...>, and six FPU pipes. // Agner, 22.10 Floating point execution pipes // There are six floating point/vector execution pipes, -def Zn3FPP0 : ProcResource<1>; -def Zn3FPP1 : ProcResource<1>; -def Zn3FPP2 : ProcResource<1>; -def Zn3FPP3 : ProcResource<1>; -def Zn3FPP45 : ProcResource<2>; +def Zn3FP0 : ProcResource<1>; +def Zn3FP1 : ProcResource<1>; +def Zn3FP2 : ProcResource<1>; +def Zn3FP3 : ProcResource<1>; +def Zn3FP45 : ProcResource<2>; // // Execution Units @@ -205,63 +205,63 @@ def Zn3FPP45 : ProcResource<2>; // AMD SOG 19h, 2.11.1 Floating Point Execution Resources // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) -defvar Zn3FPFMul0 = Zn3FPP0; -defvar Zn3FPFMul1 = Zn3FPP1; +defvar Zn3FPFMul0 = Zn3FP0; +defvar Zn3FPFMul1 = Zn3FP1; // (v)FADD* -defvar Zn3FPFAdd0 = Zn3FPP2; -defvar Zn3FPFAdd1 = Zn3FPP3; +defvar Zn3FPFAdd0 = Zn3FP2; +defvar Zn3FPFAdd1 = Zn3FP3; // All convert operations except pack/unpack -defvar Zn3FPFCvt0 = Zn3FPP2; -defvar Zn3FPFCvt1 = Zn3FPP3; +defvar Zn3FPFCvt0 = Zn3FP2; +defvar Zn3FPFCvt1 = Zn3FP3; // All Divide and Square Root except Reciprocal Approximation // AMD SOG 19h, 2.11.1 Floating Point Execution Resources // FDIV unit can support 2 simultaneous operations in flight // even though it occupies a single pipe. // FIXME: BufferSize=2 ? -defvar Zn3FPFDiv = Zn3FPP1; +defvar Zn3FPFDiv = Zn3FP1; // Moves and Logical operations on Floating Point Data Types -defvar Zn3FPFMisc0 = Zn3FPP0; -defvar Zn3FPFMisc1 = Zn3FPP1; -defvar Zn3FPFMisc2 = Zn3FPP2; -defvar Zn3FPFMisc3 = Zn3FPP3; +defvar Zn3FPFMisc0 = Zn3FP0; +defvar Zn3FPFMisc1 = Zn3FP1; +defvar Zn3FPFMisc2 = Zn3FP2; +defvar Zn3FPFMisc3 = Zn3FP3; // Integer Adds, Subtracts, and Compares // Some complex VADD operations are not available in all pipes. 
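// (The defvar changes below are purely the Zn3FPPn -> Zn3FPn rename of the
// floating-point pipe resources; the pipe assignments themselves do not
// change.)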
-defvar Zn3FPVAdd0 = Zn3FPP0; -defvar Zn3FPVAdd1 = Zn3FPP1; -defvar Zn3FPVAdd2 = Zn3FPP2; -defvar Zn3FPVAdd3 = Zn3FPP3; +defvar Zn3FPVAdd0 = Zn3FP0; +defvar Zn3FPVAdd1 = Zn3FP1; +defvar Zn3FPVAdd2 = Zn3FP2; +defvar Zn3FPVAdd3 = Zn3FP3; // Integer Multiplies, SAD, Blendvb -defvar Zn3FPVMul0 = Zn3FPP0; -defvar Zn3FPVMul1 = Zn3FPP3; +defvar Zn3FPVMul0 = Zn3FP0; +defvar Zn3FPVMul1 = Zn3FP3; // Data Shuffles, Packs, Unpacks, Permute // Some complex shuffle operations are only available in pipe1. -defvar Zn3FPVShuf = Zn3FPP1; -defvar Zn3FPVShufAux = Zn3FPP2; +defvar Zn3FPVShuf = Zn3FP1; +defvar Zn3FPVShufAux = Zn3FP2; // Bit Shift Left/Right operations -defvar Zn3FPVShift0 = Zn3FPP1; -defvar Zn3FPVShift1 = Zn3FPP2; +defvar Zn3FPVShift0 = Zn3FP1; +defvar Zn3FPVShift1 = Zn3FP2; // Moves and Logical operations on Packed Integer Data Types -defvar Zn3FPVMisc0 = Zn3FPP0; -defvar Zn3FPVMisc1 = Zn3FPP1; -defvar Zn3FPVMisc2 = Zn3FPP2; -defvar Zn3FPVMisc3 = Zn3FPP3; +defvar Zn3FPVMisc0 = Zn3FP0; +defvar Zn3FPVMisc1 = Zn3FP1; +defvar Zn3FPVMisc2 = Zn3FP2; +defvar Zn3FPVMisc3 = Zn3FP3; // *AES* -defvar Zn3FPAES0 = Zn3FPP0; -defvar Zn3FPAES1 = Zn3FPP1; +defvar Zn3FPAES0 = Zn3FP0; +defvar Zn3FPAES1 = Zn3FP1; // *CLM* -defvar Zn3FPCLM0 = Zn3FPP0; -defvar Zn3FPCLM1 = Zn3FPP1; +defvar Zn3FPCLM0 = Zn3FP0; +defvar Zn3FPCLM1 = Zn3FP1; // Execution pipeline grouping //===----------------------------------------------------------------------===// @@ -269,7 +269,7 @@ defvar Zn3FPCLM1 = Zn3FPP1; // AMD SOG 19h, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). -def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>; +def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>; // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>; @@ -293,12 +293,12 @@ def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>; // AMD SOG 19h, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). -defvar Zn3FPLd01 = Zn3FPP45; +defvar Zn3FPLd01 = Zn3FP45; // AMD SOG 19h, 2.11 Floating-Point Unit // Note that FP stores are supported on two pipelines, // but throughput is limited to one per cycle. -let Super = Zn3FPP45 in +let Super = Zn3FP45 in def Zn3FPSt : ProcResource<1>; // Integer Adds, Subtracts, and Compares @@ -345,8 +345,8 @@ def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1], // AMD SOG 19h, 2.11 Floating-Point Unit // <...> the scheduler can issue 1 micro op per cycle for each pipe. // FIXME: those are two separate schedulers, not a single big one. -def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0 - Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1 +def Zn3FP : ProcResGroup<[Zn3FP0, Zn3FP2, /*Zn3FP4,*/ // scheduler 0 + Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1 ]> { let BufferSize = !mul(2, 32); } @@ -838,9 +838,9 @@ defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>; defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis // Floating point. This covers both scalar and vector operations. 
-defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 276bc7f..86b4560 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -211,8 +211,9 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT,
}

static CodeModel::Model
-getEffectiveX86CodeModel(std::optional<CodeModel::Model> CM, bool JIT,
-                         bool Is64Bit) {
+getEffectiveX86CodeModel(const Triple &TT, std::optional<CodeModel::Model> CM,
+                         bool JIT) {
+  bool Is64Bit = TT.getArch() == Triple::x86_64;
   if (CM) {
     if (*CM == CodeModel::Tiny)
       report_fatal_error("Target does not support the tiny CodeModel", false);
@@ -234,7 +235,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(
           T, computeDataLayout(TT), TT, CPU, FS, Options,
           getEffectiveRelocModel(TT, JIT, RM),
-          getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
+          getEffectiveX86CodeModel(TT, CM, JIT),
           OL),
       TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
   // On PS4/PS5, the "return address" of a 'noreturn' call must still be within
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2ec2946..cd61029 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2664,9 +2664,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  };

  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },