Diffstat (limited to 'llvm/lib/Target/RISCV')
63 files changed, 3884 insertions, 890 deletions
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index edde7ac..9bb3724 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -222,7 +222,6 @@ class RISCVAsmParser : public MCTargetAsmParser { ParseStatus parseRegReg(OperandVector &Operands); ParseStatus parseXSfmmVType(OperandVector &Operands); - ParseStatus parseRetval(OperandVector &Operands); ParseStatus parseZcmpStackAdj(OperandVector &Operands, bool ExpectNegative = false); ParseStatus parseZcmpNegStackAdj(OperandVector &Operands) { @@ -352,7 +351,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { } Kind; struct RegOp { - MCRegister RegNum; + MCRegister Reg; bool IsGPRAsFPR; }; @@ -461,20 +460,18 @@ public: bool isReg() const override { return Kind == KindTy::Register; } bool isExpr() const { return Kind == KindTy::Expression; } bool isV0Reg() const { - return Kind == KindTy::Register && Reg.RegNum == RISCV::V0; + return Kind == KindTy::Register && Reg.Reg == RISCV::V0; } bool isAnyReg() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.Reg)); } bool isAnyRegC() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains( - Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains( - Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg.Reg)); } bool isImm() const override { return isExpr(); } bool isMem() const override { return false; } @@ -488,35 +485,33 @@ public: bool isGPR() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg); } bool isGPRPair() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains(Reg.Reg); } bool isGPRPairC() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains(Reg.Reg); } bool isGPRPairNoX0() const { return Kind == KindTy::Register && RISCVMCRegisterClasses[RISCV::GPRPairNoX0RegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isGPRF16() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.Reg); } bool isGPRF32() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.Reg); } bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } @@ -991,7 +986,7 @@ public: MCRegister getReg() const override { assert(Kind == KindTy::Register && "Invalid type access!"); - return Reg.RegNum; + return Reg.Reg; } StringRef getSysReg() const { @@ -1047,7 +1042,7 @@ public: OS << "<fpimm: " << FPImm.Val << ">"; break; 
case KindTy::Register: - OS << "<reg: " << RegName(Reg.RegNum) << " (" << Reg.RegNum + OS << "<reg: " << RegName(Reg.Reg) << " (" << Reg.Reg.id() << (Reg.IsGPRAsFPR ? ") GPRasFPR>" : ")>"); break; case KindTy::Token: @@ -1099,7 +1094,7 @@ public: static std::unique_ptr<RISCVOperand> createReg(MCRegister Reg, SMLoc S, SMLoc E, bool IsGPRAsFPR = false) { auto Op = std::make_unique<RISCVOperand>(KindTy::Register); - Op->Reg.RegNum = Reg; + Op->Reg.Reg = Reg; Op->Reg.IsGPRAsFPR = IsGPRAsFPR; Op->StartLoc = S; Op->EndLoc = E; @@ -1335,28 +1330,28 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, bool IsRegVR = RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg); if (IsRegFPR64 && Kind == MCK_FPR128) { - Op.Reg.RegNum = convertFPR64ToFPR128(Reg); + Op.Reg.Reg = convertFPR64ToFPR128(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. if ((IsRegFPR64 && Kind == MCK_FPR32) || (IsRegFPR64C && Kind == MCK_FPR32C)) { - Op.Reg.RegNum = convertFPR64ToFPR32(Reg); + Op.Reg.Reg = convertFPR64ToFPR32(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR16 from an FPR64, coerce the // register from FPR64 to FPR16 if necessary. if (IsRegFPR64 && Kind == MCK_FPR16) { - Op.Reg.RegNum = convertFPR64ToFPR16(Reg); + Op.Reg.Reg = convertFPR64ToFPR16(Reg); return Match_Success; } if (Kind == MCK_GPRAsFPR16 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_H; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_H; return Match_Success; } if (Kind == MCK_GPRAsFPR32 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_W; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_W; return Match_Success; } @@ -1372,8 +1367,8 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // As the parser couldn't differentiate an VRM2/VRM4/VRM8 from an VR, coerce // the register from VR to VRM2/VRM4/VRM8 if necessary. 
if (IsRegVR && (Kind == MCK_VRM2 || Kind == MCK_VRM4 || Kind == MCK_VRM8)) { - Op.Reg.RegNum = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); - if (!Op.Reg.RegNum) + Op.Reg.Reg = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); + if (!Op.Reg.Reg) return Match_InvalidOperand; return Match_Success; } @@ -1659,10 +1654,6 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -1, (1 << 5) - 1, "immediate must be non-zero in the range"); - case Match_InvalidXSfmmVType: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return generateXSfmmVTypeError(ErrorLoc); - } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); @@ -4091,6 +4082,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return false; } + case RISCV::PseudoCV_ELW: + emitLoadStoreSymbol(Inst, RISCV::CV_ELW, IDLoc, Out, /*HasTmpReg=*/false); + return false; } emitToStreamer(Out, Inst); diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index e9088a4..f8cf71e 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -72,6 +72,7 @@ add_llvm_target(RISCVCodeGen RISCVVLOptimizer.cpp RISCVVMV0Elimination.cpp RISCVZacasABIFix.cpp + RISCVZilsdOptimizer.cpp GISel/RISCVCallLowering.cpp GISel/RISCVInstructionSelector.cpp GISel/RISCVLegalizerInfo.cpp diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 3d5a55c..4f2e633 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -99,6 +99,7 @@ private: LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; + bool selectExtractSubvector(MachineInstr &MI, MachineIRBuilder &MIB) const; ComplexRendererFns selectShiftMask(MachineOperand &Root, unsigned ShiftWidth) const; @@ -967,6 +968,45 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( } } +bool RISCVInstructionSelector::selectExtractSubvector( + MachineInstr &MI, MachineIRBuilder &MIB) const { + assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_SUBVECTOR); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + + LLT DstTy = MRI->getType(DstReg); + LLT SrcTy = MRI->getType(SrcReg); + + unsigned Idx = static_cast<unsigned>(MI.getOperand(2).getImm()); + + MVT DstMVT = getMVTForLLT(DstTy); + MVT SrcMVT = getMVTForLLT(SrcTy); + + unsigned SubRegIdx; + std::tie(SubRegIdx, Idx) = + RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( + SrcMVT, DstMVT, Idx, &TRI); + + if (Idx != 0) + return false; + + unsigned DstRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(DstMVT); + const TargetRegisterClass *DstRC = TRI.getRegClass(DstRegClassID); + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) + return false; + + unsigned SrcRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(SrcMVT); + const TargetRegisterClass *SrcRC = TRI.getRegClass(SrcRegClassID); + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + + MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(SrcReg, 0, SubRegIdx); + + MI.eraseFromParent(); + return true; +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -1239,6 +1279,8 @@ 
bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: return selectIntrinsicWithSideEffects(MI, MIB); + case TargetOpcode::G_EXTRACT_SUBVECTOR: + return selectExtractSubvector(MI, MIB); default: return false; } @@ -1569,7 +1611,7 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI, switch (TM.getCodeModel()) { default: { - reportGISelFailure(*MF, *TPC, *MORE, getName(), + reportGISelFailure(*MF, *MORE, getName(), "Unsupported code model for lowering", MI); return false; } diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index b1794b7..2cc594a 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -238,7 +238,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .clampScalar(0, sXLen, sXLen) .scalarSameSizeAs(1, 0); } else { - CTPOPActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower(); + CTPOPActions.widenScalarToNextPow2(0, /*Min*/ 8) + .clampScalar(0, s8, sXLen) + .scalarSameSizeAs(1, 0) + .lower(); } getActionDefinitionsBuilder(G_CONSTANT) @@ -1208,7 +1211,7 @@ bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI, // to place the desired subvector starting at element 0. const LLT XLenTy(STI.getXLenVT()); auto SlidedownAmt = MIB.buildVScale(XLenTy, RemIdx); - auto [Mask, VL] = buildDefaultVLOps(LitTy, MIB, MRI); + auto [Mask, VL] = buildDefaultVLOps(InterLitTy, MIB, MRI); uint64_t Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; auto Slidedown = MIB.buildInstr( RISCV::G_VSLIDEDOWN_VL, {InterLitTy}, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 5b8cfb2..dbf5cfe 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -393,7 +393,6 @@ enum OperandType : unsigned { OPERAND_UIMM14_LSB00, OPERAND_UIMM16, OPERAND_UIMM16_NONZERO, - OPERAND_UIMM20, OPERAND_UIMMLOG2XLEN, OPERAND_UIMMLOG2XLEN_NONZERO, OPERAND_UIMM32, @@ -412,13 +411,11 @@ enum OperandType : unsigned { OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM10_UNSIGNED, OPERAND_SIMM11, - OPERAND_SIMM12, OPERAND_SIMM12_LSB00000, OPERAND_SIMM16, OPERAND_SIMM16_NONZERO, OPERAND_SIMM20_LI, OPERAND_SIMM26, - OPERAND_BARE_SIMM32, OPERAND_CLUI_IMM, OPERAND_VTYPEI10, OPERAND_VTYPEI11, @@ -447,6 +444,15 @@ enum OperandType : unsigned { // Vtype operand for XSfmm extension. OPERAND_XSFMM_VTYPE, OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE, + + OPERAND_UIMM20_LUI, + OPERAND_UIMM20_AUIPC, + + // Simm12 or constant pool, global, basicblock, etc. + OPERAND_SIMM12_LO, + + OPERAND_BARE_SIMM32, + // Operand is either a register or uimm5, this is used by V extension pseudo // instructions to represent a value that be passed as AVL to either vsetvli // or vsetivli. 
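A minimal sketch of what the reworked G_CTPOP rule above amounts to for a sub-XLen scalar, assuming the generic lower() fallback expands to the usual parallel bit-count once the value has been widened to the next power of two (at least 8 bits) and clamped to [s8, sXLen]; the function name below is illustrative only and not part of the patch.

#include <cstdint>

// Illustrative only: the classic parallel bit-count that the generic
// lower() fallback expands G_CTPOP into after the operand has been
// widened to a power-of-two width.
static unsigned popcount32(uint32_t X) {
  X = X - ((X >> 1) & 0x55555555u);                 // 2-bit field counts
  X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u); // 4-bit field counts
  X = (X + (X >> 4)) & 0x0F0F0F0Fu;                 // 8-bit field counts
  return (X * 0x01010101u) >> 24;                   // sum the byte counts
}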
@@ -700,7 +706,7 @@ enum RLISTENCODE { inline unsigned encodeRegList(MCRegister EndReg, bool IsRVE = false) { assert((!IsRVE || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); - switch (EndReg) { + switch (EndReg.id()) { case RISCV::X1: return RLISTENCODE::RA; case RISCV::X8: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 98c8738..a2b75e4 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -11,7 +11,6 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCFixup.h" -#include <utility> #undef RISCV diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 5934c91..fd460e4 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -725,7 +725,7 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo, MCOperand MO = MI.getOperand(OpNo); assert(MO.isReg() && "Expected a register."); - switch (MO.getReg()) { + switch (MO.getReg().id()) { default: llvm_unreachable("Invalid mask register."); case RISCV::V0: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 26f434b..cedaa86 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -79,6 +79,32 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, } } + if (STI.hasFeature(RISCV::FeatureStdExtP)) { + // Check if the immediate is packed i8 or i10 + int32_t Bit63To32 = Val >> 32; + int32_t Bit31To0 = Val; + int16_t Bit31To16 = Bit31To0 >> 16; + int16_t Bit15To0 = Bit31To0; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Bit15To0; + if (Bit63To32 == Bit31To0) { + if (IsRV64 && isInt<10>(Bit63To32)) { + Res.emplace_back(RISCV::PLI_W, Bit63To32); + return; + } + if (Bit31To16 == Bit15To0) { + if (isInt<10>(Bit31To16)) { + Res.emplace_back(RISCV::PLI_H, Bit31To16); + return; + } + if (Bit15To8 == Bit7To0) { + Res.emplace_back(RISCV::PLI_B, Bit15To8); + return; + } + } + } + } + if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -562,6 +588,9 @@ OpndKind Inst::getOpndKind() const { case RISCV::LUI: case RISCV::QC_LI: case RISCV::QC_E_LI: + case RISCV::PLI_B: + case RISCV::PLI_H: + case RISCV::PLI_W: return RISCVMatInt::Imm; case RISCV::ADD_UW: return RISCVMatInt::RegX0; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index a82cd65..5df8edb 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -21,7 +21,7 @@ namespace RISCVMatInt { enum OpndKind { RegImm, // ADDI/ADDIW/XORI/SLLI/SRLI/SLLI_UW/RORI/BSETI/BCLRI/TH_SRRI - Imm, // LUI/QC_LI/QC_E_LI + Imm, // LUI/QC_LI/QC_E_LI/PLI_B/PLI_H/PLI_W RegReg, // SH1ADD/SH2ADD/SH3ADD/PACK RegX0, // ADD_UW }; diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 51e8e85..048db20 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -26,8 +26,16 @@ class RISCVRegisterBankInfo; class RISCVSubtarget; class RISCVTargetMachine; -FunctionPass *createRISCVCodeGenPreparePass(); -void initializeRISCVCodeGenPreparePass(PassRegistry &); +class RISCVCodeGenPreparePass : 
public PassInfoMixin<RISCVCodeGenPreparePass> { +private: + const RISCVTargetMachine *TM; + +public: + RISCVCodeGenPreparePass(const RISCVTargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; +FunctionPass *createRISCVCodeGenPrepareLegacyPass(); +void initializeRISCVCodeGenPrepareLegacyPassPass(PassRegistry &); FunctionPass *createRISCVDeadRegisterDefinitionsPass(); void initializeRISCVDeadRegisterDefinitionsPass(PassRegistry &); @@ -94,6 +102,9 @@ void initializeRISCVPushPopOptPass(PassRegistry &); FunctionPass *createRISCVLoadStoreOptPass(); void initializeRISCVLoadStoreOptPass(PassRegistry &); +FunctionPass *createRISCVPreAllocZilsdOptPass(); +void initializeRISCVPreAllocZilsdOptPass(PassRegistry &); + FunctionPass *createRISCVZacasABIFixPass(); void initializeRISCVZacasABIFixPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index b24d863..f6f82fd 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -96,6 +96,8 @@ def RISCVAsmWriter : AsmWriter { int PassSubtarget = 1; } +defm : RemapAllTargetPseudoPointerOperands<GPR>; + def RISCV : Target { let InstructionSet = RISCVInstrInfo; let AssemblyParsers = [RISCVAsmParser]; diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index ce34959..1ee4c66 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -33,20 +33,33 @@ using namespace llvm; #define PASS_NAME "RISC-V CodeGenPrepare" namespace { - -class RISCVCodeGenPrepare : public FunctionPass, - public InstVisitor<RISCVCodeGenPrepare, bool> { +class RISCVCodeGenPrepare : public InstVisitor<RISCVCodeGenPrepare, bool> { + Function &F; const DataLayout *DL; const DominatorTree *DT; const RISCVSubtarget *ST; public: + RISCVCodeGenPrepare(Function &F, const DominatorTree *DT, + const RISCVSubtarget *ST) + : F(F), DL(&F.getDataLayout()), DT(DT), ST(ST) {} + bool run(); + bool visitInstruction(Instruction &I) { return false; } + bool visitAnd(BinaryOperator &BO); + bool visitIntrinsicInst(IntrinsicInst &I); + bool expandVPStrideLoad(IntrinsicInst &I); + bool widenVPMerge(IntrinsicInst &I); +}; +} // namespace + +namespace { +class RISCVCodeGenPrepareLegacyPass : public FunctionPass { +public: static char ID; - RISCVCodeGenPrepare() : FunctionPass(ID) {} + RISCVCodeGenPrepareLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return PASS_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -54,15 +67,8 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); } - - bool visitInstruction(Instruction &I) { return false; } - bool visitAnd(BinaryOperator &BO); - bool visitIntrinsicInst(IntrinsicInst &I); - bool expandVPStrideLoad(IntrinsicInst &I); - bool widenVPMerge(IntrinsicInst &I); }; - -} // end anonymous namespace +} // namespace // Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set, // but bits 63:32 are zero. 
If we know that bit 31 of X is 0, we can fill @@ -265,25 +271,17 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) { IRBuilder<> Builder(&II); Type *STy = VTy->getElementType(); Value *Val = Builder.CreateLoad(STy, BasePtr); - Value *Res = Builder.CreateIntrinsic(Intrinsic::experimental_vp_splat, {VTy}, - {Val, II.getOperand(2), VL}); + Value *Res = Builder.CreateIntrinsic( + Intrinsic::vp_merge, VTy, + {II.getOperand(2), Builder.CreateVectorSplat(VTy->getElementCount(), Val), + PoisonValue::get(VTy), VL}); II.replaceAllUsesWith(Res); II.eraseFromParent(); return true; } -bool RISCVCodeGenPrepare::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto &TPC = getAnalysis<TargetPassConfig>(); - auto &TM = TPC.getTM<RISCVTargetMachine>(); - ST = &TM.getSubtarget<RISCVSubtarget>(F); - - DL = &F.getDataLayout(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - +bool RISCVCodeGenPrepare::run() { bool MadeChange = false; for (auto &BB : F) for (Instruction &I : llvm::make_early_inc_range(BB)) @@ -292,12 +290,40 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) { return MadeChange; } -INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) +bool RISCVCodeGenPrepareLegacyPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto &TPC = getAnalysis<TargetPassConfig>(); + auto &TM = TPC.getTM<RISCVTargetMachine>(); + auto ST = &TM.getSubtarget<RISCVSubtarget>(F); + auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + RISCVCodeGenPrepare RVCGP(F, DT, ST); + return RVCGP.run(); +} + +INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, + false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS_END(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, false, + false) -char RISCVCodeGenPrepare::ID = 0; +char RISCVCodeGenPrepareLegacyPass::ID = 0; + +FunctionPass *llvm::createRISCVCodeGenPrepareLegacyPass() { + return new RISCVCodeGenPrepareLegacyPass(); +} -FunctionPass *llvm::createRISCVCodeGenPreparePass() { - return new RISCVCodeGenPrepare(); +PreservedAnalyses RISCVCodeGenPreparePass::run(Function &F, + FunctionAnalysisManager &FAM) { + DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(F); + auto ST = &TM->getSubtarget<RISCVSubtarget>(F); + bool Changed = RISCVCodeGenPrepare(F, DT, ST).run(); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; } diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 51180f5..5d3d9b5 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -59,7 +59,6 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n"); @@ -89,7 +88,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); Register X0Reg; - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + 
const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (RC && RC->contains(RISCV::X0)) { X0Reg = RISCV::X0; } else if (RC && RC->contains(RISCV::X0_W)) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index b0453fc..60e0afd 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -132,6 +132,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: case RISCV::PseudoCCMUL: + case RISCV::PseudoCCLUI: + case RISCV::PseudoCCQC_LI: + case RISCV::PseudoCCQC_E_LI: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -239,6 +242,9 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; + case RISCV::PseudoCCLUI: NewOpc = RISCV::LUI; break; + case RISCV::PseudoCCQC_LI: NewOpc = RISCV::QC_LI; break; + case RISCV::PseudoCCQC_E_LI: NewOpc = RISCV::QC_E_LI; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; @@ -268,6 +274,9 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, .add(MI.getOperand(5)) .add(MI.getOperand(6)) .add(MI.getOperand(7)); + } else if (NewOpc == RISCV::LUI || NewOpc == RISCV::QC_LI || + NewOpc == RISCV::QC_E_LI) { + BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg).add(MI.getOperand(5)); } else { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 0b964c4..1a5bb83 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -194,6 +194,10 @@ def HasStdExtZilsd : Predicate<"Subtarget->hasStdExtZilsd()">, AssemblerPredicate<(all_of FeatureStdExtZilsd), "'Zilsd' (Load/Store pair instructions)">; +def FeatureZilsd4ByteAlign + : SubtargetFeature<"zilsd-4byte-align", "AllowZilsd4ByteAlign", "true", + "Allow 4-byte alignment for Zilsd LD/SD instructions">; + // Multiply Extensions def FeatureStdExtZmmul @@ -1102,38 +1106,18 @@ def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, // Packed SIMD Extensions def FeatureStdExtP - : RISCVExperimentalExtension<0, 15, + : RISCVExperimentalExtension<0, 18, "'Base P' (Packed SIMD)">; def HasStdExtP : Predicate<"Subtarget->hasStdExtP()">, AssemblerPredicate<(all_of FeatureStdExtP), "'Base P' (Packed SIMD)">; -def HasStdExtZbaOrP - : Predicate<"Subtarget->hasStdExtZba() || Subtarget->hasStdExtP()">, - AssemblerPredicate<(any_of FeatureStdExtZba, FeatureStdExtP), - "'Zba' (Address Generation Instructions) or " - "'Base P' (Packed-SIMD)">; - -def HasStdExtZbbOrP - : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtP()">, - AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtP), - "'Zbb' (Basic Bit-Manipulation) or " - "'Base P' (Packed-SIMD)">; - def HasStdExtZbkbOrP : Predicate<"Subtarget->hasStdExtZbkb() || Subtarget->hasStdExtP()">, AssemblerPredicate<(any_of FeatureStdExtZbkb, FeatureStdExtP), "'Zbkb' (Bitmanip instructions for Cryptography) or " "'Base P' (Packed-SIMD)">; -def HasStdExtZbbOrZbkbOrP - : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || " - "Subtarget->hasStdExtP()">, - AssemblerPredicate<(any_of FeatureStdExtZbb, 
FeatureStdExtZbkb, FeatureStdExtP), - "'Zbb' (Basic Bit-Manipulation) or " - "'Zbkb' (Bitmanip instructions for Cryptography) or " - "'Base P' (Packed-SIMD)">; - //===----------------------------------------------------------------------===// // Vendor extensions //===----------------------------------------------------------------------===// @@ -1787,6 +1771,45 @@ def FeatureUnalignedVectorMem "true", "Has reasonably performant unaligned vector " "loads and stores">; +// Assume that lock-free native-width atomics are available, even if the target +// and operating system combination would not usually provide them. The user +// is responsible for providing any necessary __sync implementations. Code +// built with this feature is not ABI-compatible with code built without this +// feature, if atomic variables are exposed across the ABI boundary. +def FeatureForcedAtomics : SubtargetFeature< + "forced-atomics", "HasForcedAtomics", "true", + "Assume that lock-free native-width atomics are available">; +def HasAtomicLdSt + : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; + +// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) +// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] +// in section 13.3. Eventual Success of Store-Conditional Instructions, defines +// _constrained_ LR/SC loops: +// The dynamic code executed between the LR and SC instructions can only +// contain instructions from the base ''I'' instruction set, excluding loads, +// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM +// instructions. Compressed forms of the aforementioned ''I'' instructions in +// the Zca and Zcb extensions are also permitted. +// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, +// and success is implementation specific. For implementations which know that +// non-base instructions (such as the ''B'' extension) will not violate any +// forward progress guarantees, using these instructions to reduce the LR/SC +// sequence length is desirable. +def FeaturePermissiveZalrsc + : SubtargetFeature< + "permissive-zalrsc", "HasPermissiveZalrsc", "true", + "Implementation permits non-base instructions between LR/SC pairs">; + +def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", + "AllowTaggedGlobals", + "true", "Use an instruction sequence for taking the address of a global " + "that allows a memory tag in the upper address bits">; + +//===----------------------------------------------------------------------===// +// Tuning features +//===----------------------------------------------------------------------===// + def TuneNLogNVRGather : SubtargetFeature<"log-vrgather", "RISCVVRGatherCostModel", "NLog2N", "Has vrgather.vv with LMUL*log2(LMUL) latency">; @@ -1846,23 +1869,44 @@ def TuneNoDefaultUnroll : SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false", "Disable default unroll preference.">; -// SiFive 7 is able to fuse integer ALU operations with a preceding branch -// instruction. -def TuneShortForwardBranchOpt - : SubtargetFeature<"short-forward-branch-opt", "HasShortForwardBranchOpt", - "true", "Enable short forward branch optimization">; -def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; -def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +// Many Microarchitectures are able to fuse a branch over a single instruction +// with the branched-over instruction. 
We call this fusion "short forward +// branches". +// +// We can do this for a variety of instruction groups, depending on the +// microarch. We broadly group these by their scheduler class: +// - IALU: RVI Integer instructions, plus ANDN/ORN/XNOR (Zbb/Zbkb) +// - IMinMax: Zbb MIN(U)/MAX(U) +// - IMul: MUL +// +// We make the simplifying assumption that any microarches that implement +// any "short forward branches" can do the IALU fusions, and can opt into +// the other fusions they implement. +// +// The important Pseudo used by all these instructions requires the IALU +// short forward branches. +// +// Vendor-specific short-forward-branch opts may be added under IALU, as +// the vendor-specific instructions should only be enabled for vendor +// cores. +def TuneShortForwardBranchIALU + : SubtargetFeature<"short-forward-branch-ialu", "HasShortForwardBranchIALU", + "true", "Enable short forward branch optimization for RVI base instructions">; +def HasShortForwardBranchIALU : Predicate<"Subtarget->hasShortForwardBranchIALU()">; +def NoShortForwardBranch : Predicate<"!Subtarget->hasShortForwardBranchIALU()">; def TuneShortForwardBranchIMinMax - : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax", - "true", "Enable short forward branch optimization for min,max instructions in Zbb", - [TuneShortForwardBranchOpt]>; + : SubtargetFeature<"short-forward-branch-iminmax", "HasShortForwardBranchIMinMax", + "true", "Enable short forward branch optimization for MIN,MAX instructions in Zbb", + [TuneShortForwardBranchIALU]>; +def HasShortForwardBranchIMinMax : Predicate<"Subtarget->hasShortForwardBranchIMinMax()">; def TuneShortForwardBranchIMul - : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", - "true", "Enable short forward branch optimization for mul instruction", - [TuneShortForwardBranchOpt]>; + : SubtargetFeature<"short-forward-branch-imul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for MUL instruction", + [TuneShortForwardBranchIALU]>; +def HasShortForwardBranchIMul : Predicate<"Subtarget->hasShortForwardBranchIMul()">; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. @@ -1886,19 +1930,6 @@ def TuneHasSingleElementVecFP64 "Certain vector FP64 operations produce a single result " "element per cycle">; -def TuneMIPSP8700 - : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", - "MIPS p8700 processor">; - -def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", - "SiFive 7-Series processors">; - -def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron", - "Ventana Veyron-Series processors">; - -def TuneAndes45 : SubtargetFeature<"andes45", "RISCVProcFamily", "Andes45", - "Andes 45-Series processors">; - def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush", "true", "VXRM writes causes pipeline flush">; @@ -1908,37 +1939,20 @@ def TunePreferVsetvliOverReadVLENB "true", "Prefer vsetvli over read vlenb CSR to calculate VLEN">; -// Assume that lock-free native-width atomics are available, even if the target -// and operating system combination would not usually provide them. The user -// is responsible for providing any necessary __sync implementations. Code -// built with this feature is not ABI-compatible with code built without this -// feature, if atomic variables are exposed across the ABI boundary. 
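For context on the short-forward-branch tuning features above, a hedged illustration in plain C++ (with the rough RISC-V shape in comments; none of this is taken from the patch) of the pattern they describe: a conditional update emitted as a forward branch over exactly one ALU instruction, which a fusing core can retire much like a conditional move.

// Illustrative only: the shape targeted by the short-forward-branch
// tuning features, e.g. r = (b == c) ? a + d : a, emitted roughly as
//     mv   rd, rs_a
//     bne  rs_b, rs_c, 1f     // short forward branch ...
//     add  rd, rs_a, rs_d     // ... over this single ALU instruction
//   1:
int selectAddIfEqual(int a, int b, int c, int d) {
  int r = a;
  if (b == c)
    r = a + d;                 // the one branched-over operation
  return r;
}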
-def FeatureForcedAtomics : SubtargetFeature< - "forced-atomics", "HasForcedAtomics", "true", - "Assume that lock-free native-width atomics are available">; -def HasAtomicLdSt - : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; +//===----------------------------------------------------------------------===// +// CPU Families (alphabetized by vendor). +//===----------------------------------------------------------------------===// -// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) -// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] -// in section 13.3. Eventual Success of Store-Conditional Instructions, defines -// _constrained_ LR/SC loops: -// The dynamic code executed between the LR and SC instructions can only -// contain instructions from the base ''I'' instruction set, excluding loads, -// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM -// instructions. Compressed forms of the aforementioned ''I'' instructions in -// the Zca and Zcb extensions are also permitted. -// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, -// and success is implementation specific. For implementations which know that -// non-base instructions (such as the ''B'' extension) will not violate any -// forward progress guarantees, using these instructions to reduce the LR/SC -// sequence length is desirable. -def FeaturePermissiveZalrsc - : SubtargetFeature< - "permissive-zalrsc", "HasPermissiveZalrsc", "true", - "Implementation permits non-base instructions between LR/SC pairs">; +def TuneAndes45 : SubtargetFeature<"andes45", "RISCVProcFamily", "Andes45", + "Andes 45-Series processors">; + +def TuneMIPSP8700 + : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", + "MIPS p8700 processor">; + +def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", + "SiFive 7-Series processors">; + +def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron", + "Ventana Veyron-Series processors">; -def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", - "AllowTaggedGlobals", - "true", "Use an instruction sequence for taking the address of a global " - "that allows a memory tag in the upper address bits">; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f881c4c..668bb84 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -291,12 +291,12 @@ static void emitSiFiveCLICPreemptibleSaves(MachineFunction &MF, // which affects other passes. TII->storeRegToStackSlot(MBB, MBBI, RISCV::X8, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); TII->storeRegToStackSlot(MBB, MBBI, RISCV::X9, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); // Put `mcause` into X8 (s0), and `mepc` into X9 (s1). If either of these are // used in the function, then they will appear in `getUnmanagedCSI` and will @@ -357,14 +357,12 @@ static void emitSiFiveCLICPreemptibleRestores(MachineFunction &MF, // X8 and X9 need to be restored to their values on function entry, which we // saved onto the stack in `emitSiFiveCLICPreemptibleSaves`. 
- TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X9, - RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X8, - RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X9, RVFI->getInterruptCSRFrameIndex(1), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X8, RVFI->getInterruptCSRFrameIndex(0), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); } // Get the ID of the libcall used for spilling and restoring callee saved @@ -1994,17 +1992,17 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { bool RISCVFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const { + std::vector<CalleeSavedInfo> &CSI) const { auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); // Preemptible Interrupts have two additional Callee-save Frame Indexes, // not tracked by `CSI`. if (RVFI->isSiFivePreemptibleInterrupt(MF)) { for (int I = 0; I < 2; ++I) { int FI = RVFI->getInterruptCSRFrameIndex(I); - MinCSFrameIndex = std::min<unsigned>(MinCSFrameIndex, FI); - MaxCSFrameIndex = std::max<unsigned>(MaxCSFrameIndex, FI); + MFI.setIsCalleeSavedObjectIndex(FI, true); } } @@ -2030,9 +2028,6 @@ bool RISCVFrameLowering::assignCalleeSavedSpillSlots( } } - MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - for (auto &CS : CSI) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -2082,10 +2077,7 @@ bool RISCVFrameLowering::assignCalleeSavedSpillSlots( // min. 
Alignment = std::min(Alignment, getStackAlign()); int FrameIdx = MFI.CreateStackObject(Size, Alignment, true); - if ((unsigned)FrameIdx < MinCSFrameIndex) - MinCSFrameIndex = FrameIdx; - if ((unsigned)FrameIdx > MaxCSFrameIndex) - MaxCSFrameIndex = FrameIdx; + MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); CS.setFrameIdx(FrameIdx); if (RISCVRegisterInfo::isRVVRegClass(RC)) MFI.setStackID(FrameIdx, TargetStackID::ScalableVector); @@ -2177,7 +2169,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), - CS.getFrameIdx(), RC, TRI, Register(), + CS.getFrameIdx(), RC, Register(), MachineInstr::FrameSetup); } }; @@ -2267,8 +2259,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : CSInfo) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameDestroy); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register(), + MachineInstr::FrameDestroy); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } @@ -2509,3 +2501,12 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF, } } } + +int RISCVFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { + return 0; +} + +Register +RISCVFrameLowering::getInitialCFARegister(const MachineFunction &MF) const { + return RISCV::X2; +} diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 6af63a4..84e48db 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -23,6 +23,9 @@ class RISCVFrameLowering : public TargetFrameLowering { public: explicit RISCVFrameLowering(const RISCVSubtarget &STI); + int getInitialCFAOffset(const MachineFunction &MF) const override; + Register getInitialCFARegister(const MachineFunction &MF) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; @@ -44,11 +47,10 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; - bool assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const override; + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index eba35ef..67d2cac 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -17,14 +17,14 @@ include "RISCV.td" include "RISCVCombine.td" def simm12Plus1 : ImmLeaf<XLenVT, [{ - return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]>; + return Imm >= -2047 && Imm <= 2048;}]>; def simm12Plus1i32 : ImmLeaf<i32, [{ - return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]>; + return Imm >= -2047 && Imm <= 2048;}]>; // FIXME: This doesn't check that the G_CONSTANT we're deriving the immediate // from is only used once def simm12Minus1Nonzero : ImmLeaf<XLenVT, [{ - 
return (Imm >= -2049 && Imm < 0) || (Imm > 0 && Imm <= 2046);}]>; + return Imm >= -2049 && Imm <= 2046 && Imm != 0;}]>; def simm12Minus1NonzeroNonNeg1 : ImmLeaf<XLenVT, [{ return (Imm >= -2049 && Imm < -1) || (Imm > 0 && Imm <= 2046);}]>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9078335..8bfdbef 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -51,6 +51,8 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { SDValue Result; switch (N->getOpcode()) { case ISD::SPLAT_VECTOR: { + if (Subtarget->enablePExtCodeGen()) + break; // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. MVT VT = N->getSimpleValueType(0); @@ -991,6 +993,18 @@ static unsigned getSegInstNF(unsigned Intrinsic) { } } +static bool isApplicableToPLI(int Val) { + // Check if the immediate is packed i8 or i10 + int16_t Bit31To16 = Val >> 16; + int16_t Bit15To0 = Val; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Val; + if (Bit31To16 != Bit15To0) + return false; + + return isInt<10>(Bit31To16) || Bit15To8 == Bit7To0; +} + void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we have already selected. if (Node->isMachineOpcode()) { @@ -1034,6 +1048,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) Imm = SignExtend64<32>(Imm); + if (Subtarget->enablePExtCodeGen() && isApplicableToPLI(Imm) && + hasAllWUsers(Node)) { + // If it's 4 packed 8-bit integers or 2 packed signed 16-bit integers, we + // can simply copy lower 32 bits to higher 32 bits to make it able to + // rematerialize to PLI_B or PLI_H + Imm = ((uint64_t)Imm << 32) | (Imm & 0xFFFFFFFF); + } + ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget).getNode()); return; } @@ -1478,8 +1500,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (tryUnsignedBitfieldInsertInZero(Node, DL, VT, X, Msb, Lsb)) return; - // (srli (slli c2+c3), c3) if (OneUseOrZExtW && !IsCANDI) { + // (packh x0, X) + if (Subtarget->hasStdExtZbkb() && C1 == 0xff00 && C2 == 8) { + SDNode *PACKH = CurDAG->getMachineNode( + RISCV::PACKH, DL, VT, + CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()), X); + ReplaceNode(Node, PACKH); + return; + } + // (srli (slli c2+c3), c3) SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, X, CurDAG->getTargetConstant(C2 + Leading, DL, VT)); @@ -1845,6 +1875,43 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + case RISCVISD::PPACK_DH: { + assert(Subtarget->enablePExtCodeGen() && Subtarget->isRV32()); + + SDValue Val0 = Node->getOperand(0); + SDValue Val1 = Node->getOperand(1); + SDValue Val2 = Node->getOperand(2); + SDValue Val3 = Node->getOperand(3); + + SDValue Ops[] = { + CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), Val0, + CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Val2, + CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)}; + SDValue RegPair0 = + SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + MVT::Untyped, Ops), + 0); + SDValue Ops1[] = { + CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), Val1, + CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Val3, + CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)}; + SDValue RegPair1 = + SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + MVT::Untyped, Ops1), + 0); + + 
MachineSDNode *PackDH = CurDAG->getMachineNode( + RISCV::PPAIRE_DB, DL, MVT::Untyped, {RegPair0, RegPair1}); + + SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL, + MVT::i32, SDValue(PackDH, 0)); + SDValue Hi = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_odd, DL, + MVT::i32, SDValue(PackDH, 0)); + ReplaceUses(SDValue(Node, 0), Lo); + ReplaceUses(SDValue(Node, 1), Hi); + CurDAG->RemoveDeadNode(Node); + return; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(0); switch (IntNo) { @@ -2654,8 +2721,34 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + if (Subtarget->enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16)) || + (SrcVT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i16 || + SrcVT == MVT::v2i32)) || + (SrcVT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } break; } + case ISD::SCALAR_TO_VECTOR: + if (Subtarget->enablePExtCodeGen()) { + MVT SrcVT = Node->getOperand(0).getSimpleValueType(); + if ((VT == MVT::v2i32 && SrcVT == MVT::i64) || + (VT == MVT::v4i8 && SrcVT == MVT::i32)) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } + break; case ISD::INSERT_SUBVECTOR: case RISCVISD::TUPLE_INSERT: { SDValue V = Node->getOperand(0); @@ -4230,14 +4323,14 @@ bool RISCVDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &SplatVal) { bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1(SDValue N, SDValue &SplatVal) { return selectVSplatImmHelper( N, SplatVal, *CurDAG, *Subtarget, - [](int64_t Imm) { return (isInt<5>(Imm) && Imm != -16) || Imm == 16; }, + [](int64_t Imm) { return Imm >= -15 && Imm <= 16; }, /*Decrement=*/true); } bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NoDec(SDValue N, SDValue &SplatVal) { return selectVSplatImmHelper( N, SplatVal, *CurDAG, *Subtarget, - [](int64_t Imm) { return (isInt<5>(Imm) && Imm != -16) || Imm == 16; }, + [](int64_t Imm) { return Imm >= -15 && Imm <= 16; }, /*Decrement=*/false); } @@ -4245,9 +4338,7 @@ bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NonZero(SDValue N, SDValue &SplatVal) { return selectVSplatImmHelper( N, SplatVal, *CurDAG, *Subtarget, - [](int64_t Imm) { - return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16); - }, + [](int64_t Imm) { return Imm != 0 && Imm >= -15 && Imm <= 16; }, /*Decrement=*/true); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a3ccbd8..7cbb9c0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -88,13 +88,15 @@ static cl::opt<bool> cl::init(true)); // TODO: Support more ops -static const unsigned ZvfbfaVPOps[] = {ISD::VP_FNEG, ISD::VP_FABS, - ISD::VP_FCOPYSIGN}; -static const unsigned ZvfbfaOps[] = {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN}; +static const unsigned ZvfbfaVPOps[] = { + ISD::VP_FNEG, ISD::VP_FABS, ISD::VP_FCOPYSIGN}; +static const unsigned ZvfbfaOps[] = { + ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, ISD::SPLAT_VECTOR, + ISD::FADD, ISD::FSUB, ISD::FMUL}; RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) - : TargetLowering(TM), Subtarget(STI) { + : TargetLowering(TM, STI), Subtarget(STI) { 
RISCVABI::ABI ABI = Subtarget.getTargetABI(); assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); @@ -284,6 +286,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass); } + // fixed vector is stored in GPRs for P extension packed operations + if (Subtarget.enablePExtCodeGen()) { + if (Subtarget.is64Bit()) { + addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass); + } else { + addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); + } + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); @@ -323,9 +337,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); - if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() && - !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() && - !Subtarget.hasVendorXAndesPerf() && + if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() && + !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() && !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); @@ -398,7 +411,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); } - if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() || + if (Subtarget.hasStdExtZbb() || (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) { setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT, Legal); @@ -409,9 +422,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } else { setOperationAction(ISD::CTTZ, XLenVT, Expand); - // If have a CLZW, but not CTZW, custom promote i32. - if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) - setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } if (!Subtarget.hasCPOPLike()) { @@ -440,7 +450,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ABS, XLenVT, Legal); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS, MVT::i32, Custom); - } else if (Subtarget.hasShortForwardBranchOpt()) { + } else if (Subtarget.hasShortForwardBranchIALU()) { // We can use PseudoCCSUB to implement ABS. 
setOperationAction(ISD::ABS, XLenVT, Legal); } else if (Subtarget.is64Bit()) { @@ -492,6 +502,35 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; + if (Subtarget.enablePExtCodeGen()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + SmallVector<MVT, 2> VTs; + if (Subtarget.is64Bit()) { + VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + } else { + VTs.append({MVT::v2i16, MVT::v4i8}); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); + } + setOperationAction(ISD::UADDSAT, VTs, Legal); + setOperationAction(ISD::SADDSAT, VTs, Legal); + setOperationAction(ISD::USUBSAT, VTs, Legal); + setOperationAction(ISD::SSUBSAT, VTs, Legal); + setOperationAction(ISD::SSHLSAT, VTs, Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); + setOperationAction(ISD::SPLAT_VECTOR, VTs, Legal); + setOperationAction(ISD::SHL, VTs, Custom); + setOperationAction(ISD::BITCAST, VTs, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); + } + if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); @@ -756,8 +795,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, - ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF, - ISD::EXPERIMENTAL_VP_SPLAT}; + ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -772,7 +810,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS, ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::VP_LRINT, ISD::VP_LLRINT, ISD::VP_REDUCE_FMINIMUM, - ISD::VP_REDUCE_FMAXIMUM, ISD::EXPERIMENTAL_VP_SPLAT}; + ISD::VP_REDUCE_FMAXIMUM}; static const unsigned IntegerVecReduceOps[] = { ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, @@ -866,7 +904,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); - setOperationAction(ISD::EXPERIMENTAL_VP_SPLAT, VT, Custom); setOperationPromotedToType( ISD::VECTOR_SPLICE, VT, @@ -1049,6 +1086,36 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM}; + // TODO: Make more of these ops legal. 
+ static const unsigned ZvfbfaPromoteOps[] = {ISD::FMINNUM, + ISD::FMAXNUM, + ISD::FMINIMUMNUM, + ISD::FMAXIMUMNUM, + ISD::FDIV, + ISD::FMA, + ISD::FSQRT, + ISD::FCEIL, + ISD::FTRUNC, + ISD::FFLOOR, + ISD::FROUND, + ISD::FROUNDEVEN, + ISD::FRINT, + ISD::FNEARBYINT, + ISD::IS_FPCLASS, + ISD::SETCC, + ISD::FMAXIMUM, + ISD::FMINIMUM, + ISD::STRICT_FADD, + ISD::STRICT_FSUB, + ISD::STRICT_FMUL, + ISD::STRICT_FDIV, + ISD::STRICT_FSQRT, + ISD::STRICT_FMA, + ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAX, + ISD::VECREDUCE_FMINIMUM, + ISD::VECREDUCE_FMAXIMUM}; + // TODO: support more vp ops. static const unsigned ZvfhminZvfbfminPromoteVPOps[] = { ISD::VP_FADD, @@ -1181,12 +1248,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); MVT EltVT = VT.getVectorElementType(); if (isTypeLegal(EltVT)) - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT, - ISD::EXTRACT_VECTOR_ELT}, + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); else - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, - EltVT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, EltVT, Custom); setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER, ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, @@ -1226,26 +1291,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); - setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, - ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, - ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE, - ISD::VECTOR_COMPRESS}, + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, + ISD::VECTOR_INTERLEAVE, ISD::VECTOR_REVERSE, + ISD::VECTOR_SPLICE, ISD::VECTOR_COMPRESS}, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ZvfbfaVPOps, VT, Custom); - MVT EltVT = VT.getVectorElementType(); - if (isTypeLegal(EltVT)) - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT, - ISD::EXTRACT_VECTOR_ELT}, - VT, Custom); - else - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, - EltVT, Custom); setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER, ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, @@ -1259,11 +1317,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. 
if (getLMUL(VT) == RISCVVType::LMUL_8) { - setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfbfaPromoteOps, VT, Custom); setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); } else { MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); - setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfbfaPromoteOps, VT, F32VecVT); setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); } }; @@ -1580,8 +1638,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // TODO: could split the f16 vector into two vectors and do promotion. if (!isTypeLegal(F32VecVT)) continue; - setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); - // TODO: Promote VP ops to fp32. + + if (Subtarget.hasStdExtZvfbfa()) + setOperationPromotedToType(ZvfbfaPromoteOps, VT, F32VecVT); + else + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); continue; } @@ -1776,6 +1838,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false); } +TargetLoweringBase::LegalizeTypeAction +RISCVTargetLowering::getPreferredVectorAction(MVT VT) const { + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) + if (VT == MVT::v2i16 || VT == MVT::v4i8) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { @@ -1824,7 +1895,7 @@ bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const { } bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, + const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const { auto &DL = I.getDataLayout(); @@ -2493,7 +2564,7 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, // TODO: For sizes which aren't multiples of VLEN sizes, this may not be // a cheap extract. However, this case is important in practice for // shuffled extracts of longer vectors. How resolve? - return (ResElts * 2) == SrcElts && (Index == 0 || Index == ResElts); + return (ResElts * 2) == SrcElts && Index == ResElts; } MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, @@ -2505,9 +2576,7 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, !Subtarget.hasStdExtZfhminOrZhinxmin()) return MVT::f32; - MVT PartVT = TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); - - return PartVT; + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned @@ -2533,15 +2602,6 @@ unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } -unsigned RISCVTargetLowering::getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, - unsigned &NumIntermediates, MVT &RegisterVT) const { - unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv( - Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); - - return NumRegs; -} - // Changes the condition code and swaps operands if necessary, so the SetCC // operation matches one of the comparisons supported directly by branches // in the RISC-V ISA. 
May adjust compares to favor compare with 0 over compare
@@ -4392,6 +4452,33 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   SDLoc DL(Op);
 
+  if (Subtarget.isRV32() && Subtarget.enablePExtCodeGen()) {
+    if (VT != MVT::v4i8)
+      return SDValue();
+
+    // <4 x i8> BUILD_VECTOR a, b, c, d -> PACK(PPACK.DH pair(a, b), pair(c, d))
+    SDValue Val0 =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(0));
+    SDValue Val1 =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(1));
+    SDValue Val2 =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(2));
+    SDValue Val3 =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(3));
+    SDValue PackDH =
+        DAG.getNode(RISCVISD::PPACK_DH, DL, {MVT::v2i16, MVT::v2i16},
+                    {Val0, Val1, Val2, Val3});
+
+    return DAG.getNode(
+        ISD::BITCAST, DL, MVT::v4i8,
+        SDValue(
+            DAG.getMachineNode(
+                RISCV::PACK, DL, MVT::i32,
+                {DAG.getNode(ISD::BITCAST, DL, MVT::i32, PackDH.getValue(0)),
+                 DAG.getNode(ISD::BITCAST, DL, MVT::i32, PackDH.getValue(1))}),
+            0));
+  }
+
   // Proper support for f16 requires Zvfh. bf16 always requires special
   // handling. We need to cast the scalar to integer and create an integer
   // build_vector.
@@ -4793,7 +4880,7 @@ static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL,
 
   if (VT.isFloatingPoint()) {
     if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) ||
-        EltVT == MVT::bf16) {
+        (EltVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) {
       if ((EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) ||
           (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin()))
         Scalar = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Scalar);
@@ -6773,6 +6860,99 @@ SDValue RISCVTargetLowering::expandUnalignedRVVStore(SDValue Op,
                       Store->getMemOperand()->getFlags());
 }
 
+// While RVV has alignment restrictions, we should always be able to load as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing an ISD::VP_LOAD via a correctly-aligned type.
+// If the load is already correctly aligned, it returns SDValue().
+SDValue RISCVTargetLowering::expandUnalignedVPLoad(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  auto *Load = cast<VPLoadSDNode>(Op);
+  assert(Load && Load->getMemoryVT().isVector() && "Expected vector load");
+
+  if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                     Load->getMemoryVT(),
+                                     *Load->getMemOperand()))
+    return SDValue();
+
+  SDValue Mask = Load->getMask();
+
+  // FIXME: Handle masked loads somehow.
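Illustrative aside on the rewrite above, not part of the patch: the element count and the vector length are both scaled by the element size in bytes, so the byte-typed load covers exactly the same memory before being bitcast back. A sketch of that arithmetic in plain C++:

// Sketch of the VL/type scaling done by expandUnalignedVPLoad.
constexpr unsigned scaledVL(unsigned EVL, unsigned EltSizeBits) {
  return EVL * (EltSizeBits / 8);
}
static_assert(scaledVL(5, 32) == 20, "5 x i32 elements -> 20 x i8 elements");
static_assert(scaledVL(3, 64) == 24, "3 x i64 elements -> 24 x i8 elements");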
+  if (!ISD::isConstantSplatVectorAllOnes(Mask.getNode()))
+    return SDValue();
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  unsigned EltSizeBits = VT.getScalarSizeInBits();
+  assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+         "Unexpected unaligned RVV load type");
+  MVT NewVT =
+      MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+  assert(NewVT.isValid() &&
+         "Expecting equally-sized RVV vector types to be legal");
+
+  SDValue VL = Load->getVectorLength();
+  VL = DAG.getNode(ISD::MUL, DL, VL.getValueType(), VL,
+                   DAG.getConstant((EltSizeBits / 8), DL, VL.getValueType()));
+
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, NewVT.getVectorElementCount());
+  SDValue L = DAG.getLoadVP(NewVT, DL, Load->getChain(), Load->getBasePtr(),
+                            DAG.getAllOnesConstant(DL, MaskVT), VL,
+                            Load->getPointerInfo(), Load->getBaseAlign(),
+                            Load->getMemOperand()->getFlags(), AAMDNodes());
+  return DAG.getMergeValues({DAG.getBitcast(VT, L), L.getValue(1)}, DL);
+}
+
+// While RVV has alignment restrictions, we should always be able to store as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing an ISD::VP_STORE via a correctly-aligned type.
+// It returns SDValue() if the store is already correctly aligned.
+SDValue RISCVTargetLowering::expandUnalignedVPStore(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Store = cast<VPStoreSDNode>(Op);
+  assert(Store && Store->getValue().getValueType().isVector() &&
+         "Expected vector store");
+
+  if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                     Store->getMemoryVT(),
+                                     *Store->getMemOperand()))
+    return SDValue();
+
+  SDValue Mask = Store->getMask();
+
+  // FIXME: Handle masked stores somehow.
+  if (!ISD::isConstantSplatVectorAllOnes(Mask.getNode()))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue StoredVal = Store->getValue();
+  MVT VT = StoredVal.getSimpleValueType();
+  unsigned EltSizeBits = VT.getScalarSizeInBits();
+  assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+         "Unexpected unaligned RVV store type");
+  MVT NewVT =
+      MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+  assert(NewVT.isValid() &&
+         "Expecting equally-sized RVV vector types to be legal");
+
+  SDValue VL = Store->getVectorLength();
+  VL = DAG.getNode(ISD::MUL, DL, VL.getValueType(), VL,
+                   DAG.getConstant((EltSizeBits / 8), DL, VL.getValueType()));
+
+  StoredVal = DAG.getBitcast(NewVT, StoredVal);
+
+  LocationSize Size = LocationSize::precise(NewVT.getStoreSize());
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      Store->getPointerInfo(), Store->getMemOperand()->getFlags(), Size,
+      Store->getBaseAlign());
+
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, NewVT.getVectorElementCount());
+  return DAG.getStoreVP(Store->getChain(), DL, StoredVal, Store->getBasePtr(),
+                        DAG.getUNDEF(Store->getBasePtr().getValueType()),
+                        DAG.getAllOnesConstant(DL, MaskVT), VL, NewVT, MMO,
+                        ISD::UNINDEXED);
+}
+
 static SDValue lowerConstant(SDValue Op, SelectionDAG &DAG,
                              const RISCVSubtarget &Subtarget) {
   assert(Op.getValueType() == MVT::i64 && "Unexpected VT");
@@ -7546,6 +7726,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
     }
 
+    if (Subtarget.enablePExtCodeGen()) {
+      bool Is32BitCast =
+          (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) ||
+          (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16));
+      bool 
Is64BitCast = + (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 || + Op0VT == MVT::v2i32)) || + (Op0VT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) + return Op; + } + // Consider other scalar<->scalar casts as legal if the types are legal. // Otherwise expand them. if (!VT.isVector() && !Op0VT.isVector()) { @@ -8218,6 +8411,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, auto *Store = cast<StoreSDNode>(Op); SDValue StoredVal = Store->getValue(); EVT VT = StoredVal.getValueType(); + if (Subtarget.enablePExtCodeGen()) { + if (VT == MVT::v2i16 || VT == MVT::v4i8) { + SDValue DL(Op); + SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal); + SDValue NewStore = + DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(), + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return NewStore; + } + } if (VT == MVT::f64) { assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -8245,7 +8449,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (Store->isTruncatingStore()) return SDValue(); - if (!Subtarget.enableUnalignedScalarMem() && Store->getAlign() < 8) + if (Store->getAlign() < Subtarget.getZilsdAlign()) return SDValue(); SDLoc DL(Op); @@ -8304,13 +8508,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerFixedLengthVectorStoreToRVV(Op, DAG); return Op; } - case ISD::MLOAD: case ISD::VP_LOAD: + if (SDValue V = expandUnalignedVPLoad(Op, DAG)) + return V; + [[fallthrough]]; + case ISD::MLOAD: return lowerMaskedLoad(Op, DAG); case ISD::VP_LOAD_FF: return lowerLoadFF(Op, DAG); - case ISD::MSTORE: case ISD::VP_STORE: + if (SDValue V = expandUnalignedVPStore(Op, DAG)) + return V; + [[fallthrough]]; + case ISD::MSTORE: return lowerMaskedStore(Op, DAG); case ISD::VECTOR_COMPRESS: return lowerVectorCompress(Op, DAG); @@ -8398,6 +8608,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VSELECT: return lowerToScalableOp(Op, DAG); case ISD::SHL: + if (Subtarget.enablePExtCodeGen() && + Op.getSimpleValueType().isFixedLengthVector()) { + // We have patterns for scalar/immediate shift amount, so no lowering + // needed. + if (Op.getOperand(1)->getOpcode() == ISD::SPLAT_VECTOR) + return Op; + + // There's no vector-vector version of shift instruction in P extension so + // we need to unroll to scalar computation and pack them back. 
+      return DAG.UnrollVectorOp(Op.getNode());
+    }
+    [[fallthrough]];
   case ISD::SRA:
   case ISD::SRL:
     if (Op.getSimpleValueType().isFixedLengthVector())
@@ -8654,8 +8876,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerVPSpliceExperimental(Op, DAG);
   case ISD::EXPERIMENTAL_VP_REVERSE:
     return lowerVPReverseExperimental(Op, DAG);
-  case ISD::EXPERIMENTAL_VP_SPLAT:
-    return lowerVPSplatExperimental(Op, DAG);
   case ISD::CLEAR_CACHE: {
     assert(getTargetMachine().getTargetTriple().isOSLinux() &&
            "llvm.clear_cache only needs custom lower on Linux targets");
@@ -9315,7 +9535,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG,
 
 static SDValue foldBinOpIntoSelectIfProfitable(SDNode *BO, SelectionDAG &DAG,
                                                const RISCVSubtarget &Subtarget) {
-  if (Subtarget.hasShortForwardBranchOpt())
+  if (Subtarget.hasShortForwardBranchIALU())
     return SDValue();
 
   unsigned SelOpNo = 0;
@@ -9390,6 +9610,50 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   if (SDValue V = lowerSelectToBinOp(Op.getNode(), DAG, Subtarget))
     return V;
 
+  // When there is no cost for GPR <-> FPR moves, we can use a Zicond select
+  // for floating-point values when CondV is an integer type.
+  bool FPinGPR = Subtarget.hasStdExtZfinx();
+
+  // We can handle the FP value in a GPR without splitting into hi/lo parts.
+  bool FitsInGPR = TypeSize::isKnownLE(VT.getSizeInBits(),
+                                       Subtarget.getXLenVT().getSizeInBits());
+
+  bool UseZicondForFPSel = Subtarget.hasStdExtZicond() && FPinGPR &&
+                           VT.isFloatingPoint() && FitsInGPR;
+
+  if (UseZicondForFPSel) {
+
+    auto CastToInt = [&](SDValue V) -> SDValue {
+      // Treat +0.0 as int 0 to enable single 'czero' instruction generation.
+      if (isNullFPConstant(V))
+        return DAG.getConstant(0, DL, XLenVT);
+
+      if (VT == MVT::f16)
+        return DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, V);
+
+      if (VT == MVT::f32 && Subtarget.is64Bit())
+        return DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, XLenVT, V);
+
+      return DAG.getBitcast(XLenVT, V);
+    };
+
+    SDValue TrueVInt = CastToInt(TrueV);
+    SDValue FalseVInt = CastToInt(FalseV);
+
+    // Emit integer SELECT (lowers to Zicond)
+    SDValue ResultInt =
+        DAG.getNode(ISD::SELECT, DL, XLenVT, CondV, TrueVInt, FalseVInt);
+
+    // Convert back to the floating-point VT
+    if (VT == MVT::f32 && Subtarget.is64Bit())
+      return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, VT, ResultInt);
+
+    if (VT == MVT::f16)
+      return DAG.getNode(RISCVISD::FMV_H_X, DL, VT, ResultInt);
+
+    return DAG.getBitcast(VT, ResultInt);
+  }
+
   // When Zicond or XVentanaCondOps is present, emit CZERO_EQZ and CZERO_NEZ
   // nodes to implement the SELECT. Performing the lowering here allows for
   // greater control over when CZERO_{EQZ/NEZ} are used vs another branchless
@@ -10254,7 +10518,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
   }
 
   if ((ValVT == MVT::f16 && !Subtarget.hasVInstructionsF16()) ||
-      ValVT == MVT::bf16) {
+      (ValVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) {
     // If we don't have vfmv.s.f for f16/bf16, use fmv.x.h first.
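Illustrative aside on the Zicond floating-point select path above, not part of the patch: with Zfinx/Zdinx the FP value already sits in a GPR, so a plain source-level select can lower to czero.eqz/czero.nez plus an or, with no branch and no GPR<->FPR moves.

// With +zicond and +zfinx this can be emitted branchlessly: the operands are
// treated as XLen integers, selected via czero, and bitcast back to float.
float select_f(bool c, float a, float b) { return c ? a : b; }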
MVT IntVT = VecVT.changeTypeToInteger(); SDValue IntInsert = DAG.getNode( @@ -10491,7 +10755,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, } if ((EltVT == MVT::f16 && !Subtarget.hasVInstructionsF16()) || - EltVT == MVT::bf16) { + (EltVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) { // If we don't have vfmv.f.s for f16/bf16, extract to a gpr then use fmv.h.x MVT IntVT = VecVT.changeTypeToInteger(); SDValue IntVec = DAG.getBitcast(IntVT, Vec); @@ -10500,6 +10764,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(RISCVISD::FMV_H_X, DL, EltVT, IntExtract); } + if (Subtarget.enablePExtCodeGen() && VecVT.isFixedLengthVector()) { + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + SDValue Extracted = DAG.getBitcast(XLenVT, Vec); + unsigned ElemWidth = VecVT.getVectorElementType().getSizeInBits(); + SDValue Shamt = DAG.getNode(ISD::MUL, DL, XLenVT, Idx, + DAG.getConstant(ElemWidth, DL, XLenVT)); + return DAG.getNode(ISD::SRL, DL, XLenVT, Extracted, Shamt); + } + // If this is a fixed vector, we need to convert it to a scalable vector. MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { @@ -12534,10 +12809,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, SmallVector<SDValue, 8> Loads(Factor); - SDValue Increment = - DAG.getVScale(DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), - VecVT.getStoreSize().getKnownMinValue())); + SDValue Increment = DAG.getTypeSize(DL, PtrVT, VecVT.getStoreSize()); for (unsigned i = 0; i != Factor; ++i) { if (i != 0) StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment); @@ -13823,47 +14095,6 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, return convertFromScalableVector(VT, Result, DAG, Subtarget); } -SDValue RISCVTargetLowering::lowerVPSplatExperimental(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Val = Op.getOperand(0); - SDValue Mask = Op.getOperand(1); - SDValue VL = Op.getOperand(2); - MVT VT = Op.getSimpleValueType(); - - MVT ContainerVT = VT; - if (VT.isFixedLengthVector()) { - ContainerVT = getContainerForFixedLengthVector(VT); - MVT MaskVT = getMaskTypeFor(ContainerVT); - Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); - } - - SDValue Result; - if (VT.getScalarType() == MVT::i1) { - if (auto *C = dyn_cast<ConstantSDNode>(Val)) { - Result = - DAG.getNode(C->isZero() ? RISCVISD::VMCLR_VL : RISCVISD::VMSET_VL, DL, - ContainerVT, VL); - } else { - MVT WidenVT = ContainerVT.changeVectorElementType(MVT::i8); - SDValue LHS = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, WidenVT, DAG.getUNDEF(WidenVT), - DAG.getZExtOrTrunc(Val, DL, Subtarget.getXLenVT()), VL); - SDValue RHS = DAG.getConstant(0, DL, WidenVT); - Result = DAG.getNode(RISCVISD::SETCC_VL, DL, ContainerVT, - {LHS, RHS, DAG.getCondCode(ISD::SETNE), - DAG.getUNDEF(ContainerVT), Mask, VL}); - } - } else { - Result = - lowerScalarSplat(SDValue(), Val, VL, ContainerVT, DL, DAG, Subtarget); - } - - if (!VT.isFixedLengthVector()) - return Result; - return convertFromScalableVector(VT, Result, DAG, Subtarget); -} - SDValue RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const { @@ -13935,9 +14166,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, // Slide off any elements from past EVL that were reversed into the low // elements. 
- unsigned MinElts = GatherVT.getVectorMinNumElements(); SDValue VLMax = - DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), MinElts)); + DAG.getElementCount(DL, XLenVT, GatherVT.getVectorElementCount()); SDValue Diff = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, EVL); Result = getVSlidedown(DAG, Subtarget, DL, GatherVT, @@ -14627,7 +14857,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, assert(Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); - if (!Subtarget.enableUnalignedScalarMem() && Ld->getAlign() < 8) + if (Ld->getAlign() < Subtarget.getZilsdAlign()) return; SDLoc DL(N); @@ -14752,24 +14982,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, bool IsCTZ = N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF; - // Without Zbb, lower as 32 - clzw(~X & (X-1)) - if (IsCTZ && !Subtarget.hasStdExtZbb()) { - assert(Subtarget.hasStdExtP()); - - NewOp0 = DAG.getFreeze(NewOp0); - SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64); - SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0, - DAG.getConstant(1, DL, MVT::i64)); - SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1); - SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And); - SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64, - DAG.getConstant(32, DL, MVT::i64), CLZW); - SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub, - DAG.getValueType(MVT::i32)); - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); - return; - } - unsigned Opc = IsCTZ ? RISCVISD::CTZW : RISCVISD::CLZW; SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); @@ -14997,6 +15209,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } + case RISCVISD::PASUB: + case RISCVISD::PASUBU: { + MVT VT = N->getSimpleValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(VT == MVT::v2i16 || VT == MVT::v4i8); + MVT NewVT = MVT::v4i16; + if (VT == MVT::v4i8) + NewVT = MVT::v8i8; + SDValue Undef = DAG.getUNDEF(VT); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op0, Undef}); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op1, Undef}); + Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1})); + return; + } case ISD::EXTRACT_VECTOR_ELT: { // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN<SEW, as the SEW element // type is illegal (currently only vXi64 RV32). 
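Illustrative aside bridging the PASUB/PASUBU result widening above and the truncate combine introduced in the next hunk, not part of the patch: the scalar form of the averaging-subtract pattern is a widened subtract followed by a halving shift.

#include <cstddef>
#include <cstdint>

// Scalar form of trunc(srl(sub(sext a, sext b), 1)). Vectorized to v4i8 or
// v2i16, this is the shape combinePExtTruncate rewrites into PASUB (signed)
// or PASUBU (for the zero-extended variant).
void halving_sub_s8(int8_t *r, const int8_t *a, const int8_t *b, size_t n) {
  for (size_t i = 0; i < n; ++i)
    r[i] = static_cast<int8_t>(
        (static_cast<int16_t>(a[i]) - static_cast<int16_t>(b[i])) >> 1);
}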
@@ -16104,11 +16331,84 @@ static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, VT, Min); } +// Handle P extension averaging subtraction pattern: +// (vXiY (trunc (srl (sub ([s|z]ext vXiY:$a), ([s|z]ext vXiY:$b)), 1))) +// -> PASUB/PASUBU +static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (N0.getOpcode() != ISD::SRL) + return SDValue(); + + MVT VecVT = VT.getSimpleVT(); + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + + // Check if shift amount is 1 + SDValue ShAmt = N0.getOperand(1); + if (ShAmt.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(ShAmt.getNode()); + if (!BV) + return SDValue(); + SDValue Splat = BV->getSplatValue(); + if (!Splat) + return SDValue(); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat); + if (!C) + return SDValue(); + if (C->getZExtValue() != 1) + return SDValue(); + + // Check for SUB operation + SDValue Sub = N0.getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue LHS = Sub.getOperand(0); + SDValue RHS = Sub.getOperand(1); + + // Check if both operands are sign/zero extends from the target + // type + bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND && + RHS.getOpcode() == ISD::SIGN_EXTEND; + bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSignExt && !IsZeroExt) + return SDValue(); + + SDValue A = LHS.getOperand(0); + SDValue B = RHS.getOperand(0); + + // Check if the extends are from our target vector type + if (A.getValueType() != VT || B.getValueType() != VT) + return SDValue(); + + // Determine the instruction based on type and signedness + unsigned Opc; + if (IsSignExt) + Opc = RISCVISD::PASUB; + else if (IsZeroExt) + Opc = RISCVISD::PASUBU; + else + return SDValue(); + + // Create the machine node directly + return DAG.getNode(Opc, SDLoc(N), VT, {A, B}); +} + static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (VT.isFixedLengthVector() && Subtarget.enablePExtCodeGen()) + return combinePExtTruncate(N, DAG, Subtarget); + // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero // extending X. This is safe since we only need the LSB after the shift and // shift amounts larger than 31 would produce poison. If we wait until @@ -16591,22 +16891,33 @@ static SDValue expandMulToNAFSequence(SDNode *N, SelectionDAG &DAG, static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt) { uint64_t MulAmtLowBit = MulAmt & (-MulAmt); + SDValue X = N->getOperand(0); ISD::NodeType Op; uint64_t ShiftAmt1; - if (isPowerOf2_64(MulAmt + MulAmtLowBit)) { - Op = ISD::SUB; - ShiftAmt1 = MulAmt + MulAmtLowBit; - } else if (isPowerOf2_64(MulAmt - MulAmtLowBit)) { + bool CanSub = isPowerOf2_64(MulAmt + MulAmtLowBit); + auto PreferSub = [X, MulAmtLowBit]() { + // For MulAmt == 3 << M both (X << M + 2) - (X << M) + // and (X << M + 1) + (X << M) are valid expansions. + // Prefer SUB if we can get (X << M + 2) for free, + // because X is exact (Y >> M + 2). 
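Worked example of the decomposition being tuned here (editorial illustration; the constant 12 and multiplier 7 are arbitrary): for MulAmt = 12 = 3 << 2, MulAmtLowBit is 4, both expansions are valid, and the SUB form only wins when the larger shift comes for free.

// MulAmt = 12: SUB form is (X << 4) - (X << 2), ADD form is (X << 3) + (X << 2).
static_assert((7u << 4) - (7u << 2) == 12u * 7u, "SUB form computes 12*X");
static_assert((7u << 3) + (7u << 2) == 12u * 7u, "ADD form computes 12*X");
// PreferSub fires when X is an exact right shift by Log2(MulAmtLowBit) + 2,
// i.e. X == (Y >>exact 4), so the (X << 4) term folds back to Y.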
+ uint64_t ShAmt = Log2_64(MulAmtLowBit) + 2; + using namespace SDPatternMatch; + return sd_match(X, m_ExactSr(m_Value(), m_SpecificInt(ShAmt))); + }; + if (isPowerOf2_64(MulAmt - MulAmtLowBit) && !(CanSub && PreferSub())) { Op = ISD::ADD; ShiftAmt1 = MulAmt - MulAmtLowBit; + } else if (CanSub) { + Op = ISD::SUB; + ShiftAmt1 = MulAmt + MulAmtLowBit; } else { return SDValue(); } EVT VT = N->getValueType(0); SDLoc DL(N); - SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Log2_64(ShiftAmt1), DL, VT)); - SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT)); return DAG.getNode(Op, DL, VT, Shift1, Shift2); } @@ -16616,10 +16927,13 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming a slli.uw. + // Put the shift first if we can fold: + // a. a zext into the shift forming a slli.uw + // b. an exact shift right forming one shorter shift or no shift at all using namespace SDPatternMatch; if (Shift != 0 && - sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + sd_match(X, m_AnyOf(m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))), + m_ExactSr(m_Value(), m_ConstInt())))) { X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); Shift = 0; } @@ -16660,12 +16974,23 @@ static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, break; } - // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) int ShX; if (int ShY = isShifted359(MulAmt - 1, ShX)) { assert(ShX != 0 && "MulAmt=4,6,10 handled before"); + // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) if (ShX <= 3) return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true, Shift); + // 2^N * 3/5/9 + 1 -> (add (shYadd (shl X, N), (shl X, N)), X) + if (Shift == 0) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShX, DL, VT)); + SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getTargetConstant(ShY, DL, VT), Shl); + return DAG.getNode(ISD::ADD, DL, VT, ShlAdd, X); + } } return SDValue(); } @@ -16726,7 +17051,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(Shift, DL, VT), Shift1); } - // TODO: 2^(C1>3) * 3,5,9 +/- 1 + // TODO: 2^(C1>3) * 3/5/9 - 1 // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { @@ -18076,8 +18401,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, } } for (std::pair<SDValue, SDValue> OldNewValues : ValuesToReplace) { - DAG.ReplaceAllUsesOfValueWith(OldNewValues.first, OldNewValues.second); - DCI.AddToWorklist(OldNewValues.second.getNode()); + DCI.CombineTo(OldNewValues.first.getNode(), OldNewValues.second); } return InputRootReplacement; } @@ -20534,7 +20858,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Undo this and sink the fneg so we match more fmsub/fnmadd patterns. 
if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y)))))) return DAG.getNode(ISD::FNEG, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, X, Y)); + DAG.getNode(ISD::FMUL, DL, VT, X, Y, N->getFlags()), + N->getFlags()); // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y SDValue N0 = N->getOperand(0); @@ -20655,7 +20980,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // (select (x < 0), y, z) -> x >> (XLEN - 1) & (y - z) + z // (select (x >= 0), y, z) -> x >> (XLEN - 1) & (z - y) + y - if (!Subtarget.hasShortForwardBranchOpt() && isa<ConstantSDNode>(TrueV) && + if (!Subtarget.hasShortForwardBranchIALU() && isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) && isNullConstant(RHS) && (CCVal == ISD::CondCode::SETLT || CCVal == ISD::CondCode::SETGE)) { if (CCVal == ISD::CondCode::SETGE) @@ -21514,6 +21839,49 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return N->getOperand(0); break; } + case RISCVISD::VMERGE_VL: { + // vmerge_vl allones, x, y, passthru, vl -> vmv_v_v passthru, x, vl + SDValue Mask = N->getOperand(0); + SDValue True = N->getOperand(1); + SDValue Passthru = N->getOperand(3); + SDValue VL = N->getOperand(4); + + // Fixed vectors are wrapped in scalable containers, unwrap them. + using namespace SDPatternMatch; + SDValue SubVec; + if (sd_match(Mask, m_InsertSubvector(m_Undef(), m_Value(SubVec), m_Zero()))) + Mask = SubVec; + + if (!isOneOrOneSplat(Mask)) + break; + + return DAG.getNode(RISCVISD::VMV_V_V_VL, SDLoc(N), N->getSimpleValueType(0), + Passthru, True, VL); + } + case RISCVISD::VMV_V_V_VL: { + // vmv_v_v passthru, splat(x), vl -> vmv_v_x passthru, x, vl + SDValue Passthru = N->getOperand(0); + SDValue Src = N->getOperand(1); + SDValue VL = N->getOperand(2); + + // Fixed vectors are wrapped in scalable containers, unwrap them. 
+ using namespace SDPatternMatch; + SDValue SubVec; + if (sd_match(Src, m_InsertSubvector(m_Undef(), m_Value(SubVec), m_Zero()))) + Src = SubVec; + + SDValue SplatVal = DAG.getSplatValue(Src); + if (!SplatVal) + break; + MVT VT = N->getSimpleValueType(0); + return lowerScalarSplat(Passthru, SplatVal, VL, VT, SDLoc(N), DAG, + Subtarget); + } + case RISCVISD::VSLIDEDOWN_VL: + case RISCVISD::VSLIDEUP_VL: + if (N->getOperand(1)->isUndef()) + return N->getOperand(0); + break; case RISCVISD::VSLIDE1UP_VL: case RISCVISD::VFSLIDE1UP_VL: { using namespace SDPatternMatch; @@ -22203,8 +22571,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register LoReg = MI.getOperand(0).getReg(); Register HiReg = MI.getOperand(1).getReg(); Register SrcReg = MI.getOperand(2).getReg(); @@ -22213,7 +22580,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF); TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC, - RI, Register()); + Register()); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMOLo = MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8)); @@ -22239,8 +22606,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register DstReg = MI.getOperand(0).getReg(); Register LoReg = MI.getOperand(1).getReg(); Register HiReg = MI.getOperand(2).getReg(); @@ -22263,7 +22629,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, .addFrameIndex(FI) .addImm(4) .addMemOperand(MMOHi); - TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI, Register()); + TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, Register()); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -23957,14 +24323,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } } else if (Constraint == "vr") { + // Check VM first so that mask types will use that instead of VR. 
       for (const auto *RC :
-           {&RISCV::VRRegClass, &RISCV::VRM2RegClass, &RISCV::VRM4RegClass,
-            &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, &RISCV::VRN3M1RegClass,
-            &RISCV::VRN4M1RegClass, &RISCV::VRN5M1RegClass,
-            &RISCV::VRN6M1RegClass, &RISCV::VRN7M1RegClass,
-            &RISCV::VRN8M1RegClass, &RISCV::VRN2M2RegClass,
-            &RISCV::VRN3M2RegClass, &RISCV::VRN4M2RegClass,
-            &RISCV::VRN2M4RegClass}) {
+           {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass,
+            &RISCV::VRM4RegClass, &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass,
+            &RISCV::VRN3M1RegClass, &RISCV::VRN4M1RegClass,
+            &RISCV::VRN5M1RegClass, &RISCV::VRN6M1RegClass,
+            &RISCV::VRN7M1RegClass, &RISCV::VRN8M1RegClass,
+            &RISCV::VRN2M2RegClass, &RISCV::VRN3M2RegClass,
+            &RISCV::VRN4M2RegClass, &RISCV::VRN2M4RegClass}) {
         if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
           return std::make_pair(0U, RC);
@@ -23975,15 +24342,16 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         }
       }
     } else if (Constraint == "vd") {
+      // Check VMNoV0 first so that mask types will use that instead of VRNoV0.
       for (const auto *RC :
-           {&RISCV::VRNoV0RegClass, &RISCV::VRM2NoV0RegClass,
-            &RISCV::VRM4NoV0RegClass, &RISCV::VRM8NoV0RegClass,
-            &RISCV::VRN2M1NoV0RegClass, &RISCV::VRN3M1NoV0RegClass,
-            &RISCV::VRN4M1NoV0RegClass, &RISCV::VRN5M1NoV0RegClass,
-            &RISCV::VRN6M1NoV0RegClass, &RISCV::VRN7M1NoV0RegClass,
-            &RISCV::VRN8M1NoV0RegClass, &RISCV::VRN2M2NoV0RegClass,
-            &RISCV::VRN3M2NoV0RegClass, &RISCV::VRN4M2NoV0RegClass,
-            &RISCV::VRN2M4NoV0RegClass}) {
+           {&RISCV::VMNoV0RegClass, &RISCV::VRNoV0RegClass,
+            &RISCV::VRM2NoV0RegClass, &RISCV::VRM4NoV0RegClass,
+            &RISCV::VRM8NoV0RegClass, &RISCV::VRN2M1NoV0RegClass,
+            &RISCV::VRN3M1NoV0RegClass, &RISCV::VRN4M1NoV0RegClass,
+            &RISCV::VRN5M1NoV0RegClass, &RISCV::VRN6M1NoV0RegClass,
+            &RISCV::VRN7M1NoV0RegClass, &RISCV::VRN8M1NoV0RegClass,
+            &RISCV::VRN2M2NoV0RegClass, &RISCV::VRN3M2NoV0RegClass,
+            &RISCV::VRN4M2NoV0RegClass, &RISCV::VRN2M4NoV0RegClass}) {
         if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
           return std::make_pair(0U, RC);
@@ -25043,6 +25411,22 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
   return true;
 }
 
+bool RISCVTargetLowering::isLegalFirstFaultLoad(EVT DataType,
+                                                Align Alignment) const {
+  if (!Subtarget.hasVInstructions())
+    return false;
+
+  EVT ScalarType = DataType.getScalarType();
+  if (!isLegalElementTypeForRVV(ScalarType))
+    return false;
+
+  if (!Subtarget.enableUnalignedVectorMem() &&
+      Alignment < ScalarType.getStoreSize())
+    return false;
+
+  return true;
+}
+
 MachineInstr *
 RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
                                    MachineBasicBlock::instr_iterator &MBBI,
@@ -25191,8 +25575,10 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
     // Mark RVV intrinsic as supported.
     if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) {
-      // GISel doesn't support tuple types yet.
-      if (Inst.getType()->isRISCVVectorTupleTy())
+      // GISel doesn't support tuple types yet. It also doesn't support
+      // returning a struct containing a scalable vector like vleff.
+ if (Inst.getType()->isRISCVVectorTupleTy() || + Inst.getType()->isStructTy()) return true; for (unsigned i = 0; i < II->arg_size(); ++i) @@ -25201,6 +25587,8 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const { return false; } + if (II->getIntrinsicID() == Intrinsic::vector_extract) + return false; } if (Inst.getType()->isScalableTy()) @@ -25228,7 +25616,7 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return SDValue(N, 0); // Lower SDIV as SDIV // Only perform this transform if short forward branch opt is supported. - if (!Subtarget.hasShortForwardBranchOpt()) + if (!Subtarget.hasShortForwardBranchIALU()) return SDValue(); EVT VT = N->getValueType(0); if (!(VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()))) @@ -25434,3 +25822,17 @@ bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { return VT.getSizeInBits() <= Subtarget.getXLen(); } + +bool RISCVTargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + + // Avoid reassociating expressions that can be lowered to vector + // multiply accumulate (i.e. add (mul x, y), z) + if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::MUL && + (N0.getValueType().isVector() && Subtarget.hasVInstructions())) + return false; + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index dd62a9c..a7db946 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -35,7 +35,7 @@ public: const RISCVSubtarget &getSubtarget() const { return Subtarget; } - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, @@ -71,6 +71,9 @@ public: bool preferScalarizeSplat(SDNode *N) const override; + /// Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } /// Return the register type for a given MVT, ensuring vectors are treated @@ -89,12 +92,6 @@ public: CallingConv::ID CC, EVT VT) const override; - unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, - CallingConv::ID CC, EVT VT, - EVT &IntermediateVT, - unsigned &NumIntermediates, - MVT &RegisterVT) const override; - bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override; @@ -426,6 +423,10 @@ public: /// alignment is legal. bool isLegalStridedLoadStore(EVT DataType, Align Alignment) const; + /// Return true if a fault-only-first load of the given result type and + /// alignment is legal. + bool isLegalFirstFaultLoad(EVT DataType, Align Alignment) const; + unsigned getMaxSupportedInterleaveFactor() const override { return 8; } bool fallBackToDAGISel(const Instruction &Inst) const override; @@ -467,6 +468,11 @@ public: bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op + /// (op x, y), c1) where N0 is (op x, c1) and N1 is y. + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. 
Factor is the spread amount, and Index is the /// offset applied. @@ -545,7 +551,6 @@ private: SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPMergeMask(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerVPSplatExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG) const; @@ -573,6 +578,9 @@ private: SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const; SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const; + SDValue expandUnalignedVPLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue expandUnalignedVPStore(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp index 9664ab3..0fc139a 100644 --- a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp +++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp @@ -16,11 +16,12 @@ #include "RISCVInstrInfo.h" #include "RISCVSubtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#define DEBUG_TYPE "riscv-indrect-branch-tracking" +#define DEBUG_TYPE "riscv-indirect-branch-tracking" #define PASS_NAME "RISC-V Indirect Branch Tracking" using namespace llvm; @@ -54,13 +55,25 @@ FunctionPass *llvm::createRISCVIndirectBranchTrackingPass() { return new RISCVIndirectBranchTracking(); } -static void emitLpad(MachineBasicBlock &MBB, const RISCVInstrInfo *TII, - uint32_t Label) { - auto I = MBB.begin(); +static void +emitLpad(MachineBasicBlock &MBB, const RISCVInstrInfo *TII, uint32_t Label, + MachineBasicBlock::iterator I = MachineBasicBlock::iterator{}) { + if (!I.isValid()) + I = MBB.begin(); BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(RISCV::AUIPC), RISCV::X0) .addImm(Label); } +static bool isCallReturnTwice(const MachineOperand &MOp) { + if (!MOp.isGlobal()) + return false; + auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal()); + if (!CalleeFn) + return false; + AttributeList Attrs = CalleeFn->getAttributes(); + return Attrs.hasFnAttr(Attribute::ReturnsTwice); +} + bool RISCVIndirectBranchTracking::runOnMachineFunction(MachineFunction &MF) { const auto &Subtarget = MF.getSubtarget<RISCVSubtarget>(); const RISCVInstrInfo *TII = Subtarget.getInstrInfo(); @@ -100,5 +113,18 @@ bool RISCVIndirectBranchTracking::runOnMachineFunction(MachineFunction &MF) { } } + // Check for calls to functions with ReturnsTwice attribute and insert + // LPAD after such calls + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (I->isCall() && I->getNumOperands() > 0 && + isCallReturnTwice(I->getOperand(0))) { + auto NextI = std::next(I); + emitLpad(MBB, TII, FixedLabel, NextI); + Changed = true; + } + } + } + return Changed; } diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index bf9de0a..b1ba870 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp 
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -519,13 +519,13 @@ class VSETVLIInfo { unsigned AVLImm; }; - enum : uint8_t { + enum class AVLState : uint8_t { Uninitialized, AVLIsReg, AVLIsImm, AVLIsVLMAX, Unknown, // AVL and VTYPE are fully unknown - } State = Uninitialized; + } State = AVLState::Uninitialized; // Fields from VTYPE. RISCVVType::VLMUL VLMul = RISCVVType::LMUL_1; @@ -539,7 +539,7 @@ class VSETVLIInfo { public: VSETVLIInfo() : AVLImm(0), TailAgnostic(false), MaskAgnostic(false), - SEWLMULRatioOnly(false) {} + SEWLMULRatioOnly(false), AltFmt(false), TWiden(0) {} static VSETVLIInfo getUnknown() { VSETVLIInfo Info; @@ -547,27 +547,27 @@ public: return Info; } - bool isValid() const { return State != Uninitialized; } - void setUnknown() { State = Unknown; } - bool isUnknown() const { return State == Unknown; } + bool isValid() const { return State != AVLState::Uninitialized; } + void setUnknown() { State = AVLState::Unknown; } + bool isUnknown() const { return State == AVLState::Unknown; } void setAVLRegDef(const VNInfo *VNInfo, Register AVLReg) { assert(AVLReg.isVirtual()); AVLRegDef.ValNo = VNInfo; AVLRegDef.DefReg = AVLReg; - State = AVLIsReg; + State = AVLState::AVLIsReg; } void setAVLImm(unsigned Imm) { AVLImm = Imm; - State = AVLIsImm; + State = AVLState::AVLIsImm; } - void setAVLVLMAX() { State = AVLIsVLMAX; } + void setAVLVLMAX() { State = AVLState::AVLIsVLMAX; } - bool hasAVLImm() const { return State == AVLIsImm; } - bool hasAVLReg() const { return State == AVLIsReg; } - bool hasAVLVLMAX() const { return State == AVLIsVLMAX; } + bool hasAVLImm() const { return State == AVLState::AVLIsImm; } + bool hasAVLReg() const { return State == AVLState::AVLIsReg; } + bool hasAVLVLMAX() const { return State == AVLState::AVLIsVLMAX; } Register getAVLReg() const { assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual()); return AVLRegDef.DefReg; @@ -607,12 +607,36 @@ public: } } - unsigned getSEW() const { return SEW; } - RISCVVType::VLMUL getVLMUL() const { return VLMul; } - bool getTailAgnostic() const { return TailAgnostic; } - bool getMaskAgnostic() const { return MaskAgnostic; } - bool getAltFmt() const { return AltFmt; } - unsigned getTWiden() const { return TWiden; } + unsigned getSEW() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return SEW; + } + RISCVVType::VLMUL getVLMUL() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return VLMul; + } + bool getTailAgnostic() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return TailAgnostic; + } + bool getMaskAgnostic() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return MaskAgnostic; + } + bool getAltFmt() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return AltFmt; + } + unsigned getTWiden() const { + assert(isValid() && !isUnknown() && + "Can't use VTYPE for uninitialized or unknown"); + return TWiden; + } bool hasNonZeroAVL(const LiveIntervals *LIS) const { if (hasAVLImm()) @@ -837,35 +861,44 @@ public: /// Implement operator<<. 
/// @{ void print(raw_ostream &OS) const { - OS << "{"; - if (!isValid()) + OS << '{'; + switch (State) { + case AVLState::Uninitialized: OS << "Uninitialized"; - if (isUnknown()) + break; + case AVLState::Unknown: OS << "unknown"; - if (hasAVLReg()) + break; + case AVLState::AVLIsReg: OS << "AVLReg=" << llvm::printReg(getAVLReg()); - if (hasAVLImm()) + break; + case AVLState::AVLIsImm: OS << "AVLImm=" << (unsigned)AVLImm; - if (hasAVLVLMAX()) + break; + case AVLState::AVLIsVLMAX: OS << "AVLVLMAX"; - OS << ", "; + break; + } + if (isValid() && !isUnknown()) { + OS << ", "; + + unsigned LMul; + bool Fractional; + std::tie(LMul, Fractional) = decodeVLMUL(VLMul); + + OS << "VLMul=m"; + if (Fractional) + OS << 'f'; + OS << LMul << ", " + << "SEW=e" << (unsigned)SEW << ", " + << "TailAgnostic=" << (bool)TailAgnostic << ", " + << "MaskAgnostic=" << (bool)MaskAgnostic << ", " + << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", " + << "TWiden=" << (unsigned)TWiden << ", " + << "AltFmt=" << (bool)AltFmt; + } - unsigned LMul; - bool Fractional; - std::tie(LMul, Fractional) = decodeVLMUL(VLMul); - - OS << "VLMul="; - if (Fractional) - OS << "mf"; - else - OS << "m"; - OS << LMul << ", " - << "SEW=e" << (unsigned)SEW << ", " - << "TailAgnostic=" << (bool)TailAgnostic << ", " - << "MaskAgnostic=" << (bool)MaskAgnostic << ", " - << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", " - << "TWiden=" << (unsigned)TWiden << ", " - << "AltFmt=" << (bool)AltFmt << "}"; + OS << '}'; } #endif }; @@ -1755,6 +1788,14 @@ bool RISCVInsertVSETVLI::canMutatePriorConfig( if (!VNI || !PrevVNI || VNI != PrevVNI) return false; } + + // If we define VL and need to move the definition up, check we can extend + // the live interval upwards from MI to PrevMI. + Register VL = MI.getOperand(0).getReg(); + if (VL.isVirtual() && LIS && + LIS->getInterval(VL).overlaps(LIS->getInstructionIndex(PrevMI), + LIS->getInstructionIndex(MI))) + return false; } assert(PrevMI.getOperand(2).isImm() && MI.getOperand(2).isImm()); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b05956b..a3bacfb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -82,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" @@ -530,6 +531,15 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RISCV::GPRPairRegClass.contains(DstReg, SrcReg)) { + if (STI.isRV32() && STI.hasStdExtZdinx()) { + // On RV32_Zdinx, FMV.D will move a pair of registers to another pair of + // registers, in one instruction. + BuildMI(MBB, MBBI, DL, get(RISCV::FSGNJ_D_IN32X), DstReg) + .addReg(SrcReg, getRenamableRegState(RenamableSrc)) + .addReg(SrcReg, KillFlag | getRenamableRegState(RenamableSrc)); + return; + } + MCRegister EvenReg = TRI->getSubReg(SrcReg, RISCV::sub_gpr_even); MCRegister OddReg = TRI->getSubReg(SrcReg, RISCV::sub_gpr_odd); // We need to correct the odd register of X0_Pair. 
@@ -638,7 +648,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); @@ -646,8 +655,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::SW : RISCV::SD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW + : RISCV::SD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -704,7 +713,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegSpilled += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegSpilled += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, @@ -719,10 +728,12 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void RISCVInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DstReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = @@ -730,8 +741,8 @@ void RISCVInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::LW : RISCV::LD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW + : RISCV::LD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -787,7 +798,7 @@ void RISCVInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegReloaded += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegReloaded += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, @@ -1361,8 +1372,11 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, .addMBB(&DestBB, RISCVII::MO_CALL); RS->enterBasicBlockEnd(MBB); + const TargetRegisterClass *RC = &RISCV::GPRRegClass; + if (STI.hasStdExtZicfilp()) + RC = &RISCV::GPRX7RegClass; Register TmpGPR = - RS->scavengeRegisterBackwards(RISCV::GPRRegClass, MI.getIterator(), + RS->scavengeRegisterBackwards(*RC, MI.getIterator(), /*RestoreAfter=*/false, /*SpAdj=*/0, /*AllowSpill=*/false); if (TmpGPR.isValid()) @@ -1372,20 +1386,23 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // Pick s11(or s1 for rve) because it doesn't make a difference. TmpGPR = STI.hasStdExtE() ? 
RISCV::X9 : RISCV::X27; + // Force t2 if Zicfilp is on + if (STI.hasStdExtZicfilp()) + TmpGPR = RISCV::X7; int FrameIndex = RVFI->getBranchRelaxationScratchFrameIndex(); if (FrameIndex == -1) report_fatal_error("underestimated function size"); storeRegToStackSlot(MBB, MI, TmpGPR, /*IsKill=*/true, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(MI.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); MI.getOperand(1).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), TmpGPR, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -1705,6 +1722,9 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; case RISCV::MUL: return RISCV::PseudoCCMUL; + case RISCV::LUI: return RISCV::PseudoCCLUI; + case RISCV::QC_LI: return RISCV::PseudoCCQC_LI; + case RISCV::QC_E_LI: return RISCV::PseudoCCQC_E_LI; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1807,7 +1827,7 @@ bool RISCVInstrInfo::analyzeSelect(const MachineInstr &MI, Cond.push_back(MI.getOperand(2)); Cond.push_back(MI.getOperand(3)); // We can only fold when we support short forward branch opt. - Optimizable = STI.hasShortForwardBranchOpt(); + Optimizable = STI.hasShortForwardBranchIALU(); return false; } @@ -1817,7 +1837,7 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI, bool PreferFalse) const { assert(MI.getOpcode() == RISCV::PseudoCCMOVGPR && "Unknown select instruction"); - if (!STI.hasShortForwardBranchOpt()) + if (!STI.hasShortForwardBranchIALU()) return nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -2836,15 +2856,16 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, MCInstrDesc const &Desc = MI.getDesc(); for (const auto &[Index, Operand] : enumerate(Desc.operands())) { + const MachineOperand &MO = MI.getOperand(Index); unsigned OpType = Operand.OperandType; - if (OpType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && - OpType <= RISCVOp::OPERAND_LAST_RISCV_IMM) { - const MachineOperand &MO = MI.getOperand(Index); - if (MO.isReg()) { - ErrInfo = "Expected a non-register operand."; - return false; - } - if (MO.isImm()) { + switch (OpType) { + default: + if (OpType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && + OpType <= RISCVOp::OPERAND_LAST_RISCV_IMM) { + if (!MO.isImm()) { + ErrInfo = "Expected an immediate operand."; + return false; + } int64_t Imm = MO.getImm(); bool Ok; switch (OpType) { @@ -2872,7 +2893,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_UIMM(10) CASE_OPERAND_UIMM(12) CASE_OPERAND_UIMM(16) - CASE_OPERAND_UIMM(20) CASE_OPERAND_UIMM(32) CASE_OPERAND_UIMM(48) CASE_OPERAND_UIMM(64) @@ -2890,7 +2910,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, Ok = isUInt<5>(Imm) && (Imm > 3); break; case RISCVOp::OPERAND_UIMM5_PLUS1: - Ok = (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32); + Ok = Imm >= 1 && Imm <= 32; break; case RISCVOp::OPERAND_UIMM6_LSB0: Ok = isShiftedUInt<5, 1>(Imm); @@ -2913,6 +2933,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM9_LSB000: Ok = isShiftedUInt<6, 3>(Imm); break; + case RISCVOp::OPERAND_SIMM8_UNSIGNED: + Ok = isInt<8>(Imm); + break; case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0); break; @@ 
-2934,12 +2957,12 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, // clang-format off CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) + CASE_OPERAND_SIMM(10) CASE_OPERAND_SIMM(11) - CASE_OPERAND_SIMM(12) CASE_OPERAND_SIMM(26) // clang-format on case RISCVOp::OPERAND_SIMM5_PLUS1: - Ok = (isInt<5>(Imm) && Imm != -16) || Imm == 16; + Ok = Imm >= -15 && Imm <= 16; break; case RISCVOp::OPERAND_SIMM5_NONZERO: Ok = isInt<5>(Imm) && (Imm != 0); @@ -2962,9 +2985,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_SIMM20_LI: Ok = isInt<20>(Imm); break; - case RISCVOp::OPERAND_BARE_SIMM32: - Ok = isInt<32>(Imm); - break; case RISCVOp::OPERAND_UIMMLOG2XLEN: Ok = STI.is64Bit() ? isUInt<6>(Imm) : isUInt<5>(Imm); break; @@ -2973,8 +2993,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, Ok = Ok && Imm != 0; break; case RISCVOp::OPERAND_CLUI_IMM: - Ok = (isUInt<5>(Imm) && Imm != 0) || - (Imm >= 0xfffe0 && Imm <= 0xfffff); + Ok = (isUInt<5>(Imm) && Imm != 0) || (Imm >= 0xfffe0 && Imm <= 0xfffff); break; case RISCVOp::OPERAND_RVKRNUM: Ok = Imm >= 0 && Imm <= 10; @@ -3007,8 +3026,8 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, Ok = Imm >= 0 && Imm < RISCVCC::COND_INVALID; break; case RISCVOp::OPERAND_VEC_POLICY: - Ok = (Imm & - (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) == Imm; + Ok = (Imm & (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) == + Imm; break; case RISCVOp::OPERAND_SEW: Ok = (isUInt<5>(Imm) && RISCVVType::isValidSEW(1 << Imm)); @@ -3032,6 +3051,57 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } } + break; + case RISCVOp::OPERAND_SIMM12_LO: + // TODO: We could be stricter about what non-register operands are + // allowed. + if (MO.isReg()) { + ErrInfo = "Expected a non-register operand."; + return false; + } + if (MO.isImm() && !isInt<12>(MO.getImm())) { + ErrInfo = "Invalid immediate"; + return false; + } + break; + case RISCVOp::OPERAND_UIMM20_LUI: + case RISCVOp::OPERAND_UIMM20_AUIPC: + // TODO: We could be stricter about what non-register operands are + // allowed. + if (MO.isReg()) { + ErrInfo = "Expected a non-register operand."; + return false; + } + if (MO.isImm() && !isUInt<20>(MO.getImm())) { + ErrInfo = "Invalid immediate"; + return false; + } + break; + case RISCVOp::OPERAND_BARE_SIMM32: + // TODO: We could be stricter about what non-register operands are + // allowed. + if (MO.isReg()) { + ErrInfo = "Expected a non-register operand."; + return false; + } + if (MO.isImm() && !isInt<32>(MO.getImm())) { + ErrInfo = "Invalid immediate"; + return false; + } + break; + case RISCVOp::OPERAND_AVL: + if (MO.isImm()) { + int64_t Imm = MO.getImm(); + // VLMAX is represented as -1. 
+ if (!isUInt<5>(Imm) && Imm != -1) { + ErrInfo = "Invalid immediate"; + return false; + } + } else if (!MO.isReg()) { + ErrInfo = "Expected a register or immediate operand."; + return false; + } + break; } } @@ -3045,7 +3115,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, if (Op.isReg() && Op.getReg().isValid()) { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); auto *RC = MRI.getRegClass(Op.getReg()); - if (!RISCV::GPRRegClass.hasSubClassEq(RC)) { + if (!RISCV::GPRNoX0RegClass.hasSubClassEq(RC)) { ErrInfo = "Invalid register class for VL operand"; return false; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9..0ffe015 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, @@ -113,13 +116,13 @@ public: void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; using TargetInstrInfo::foldMemoryOperandImpl; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9cb53fb..9a4eb12 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -349,6 +349,7 @@ def simm12_lo : RISCVSImmLeafOp<12> { return isInt<12>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM12_LO"; } // A 12-bit signed immediate which cannot fit in 6-bit signed immediate, @@ -394,9 +395,11 @@ class UImm20OperandMaybeSym : RISCVUImmOp<20> { def uimm20_lui : UImm20OperandMaybeSym { let ParserMatchClass = UImmAsmOperand<20, "LUI">; + let OperandType = "OPERAND_UIMM20_LUI"; } def uimm20_auipc : UImm20OperandMaybeSym { let ParserMatchClass = UImmAsmOperand<20, "AUIPC">; + let OperandType = "OPERAND_UIMM20_AUIPC"; } def uimm20 : RISCVUImmOp<20>; @@ -507,7 +510,7 @@ def ixlenimm_li_restricted : Operand<XLenVT> { // A 12-bit signed immediate plus one where the imm range will be -2047~2048. def simm12_plus1 : ImmLeaf<XLenVT, - [{return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]>; + [{return Imm >= -2047 && Imm <= 2048;}]>; // A 6-bit constant greater than 32. 
def uimm6gt32 : ImmLeaf<XLenVT, [{ @@ -768,7 +771,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1, isReMaterializable = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,7 +892,7 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { -let canFoldAsLoad = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 4ffe3e6..deacd41 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,7 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index b30f8ec..bd19100 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,7 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 4cbbba3..bba9f96 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,15 +18,22 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmOp<10>; +def simm10 : RISCVSImmOp<10>, ImmLeaf<XLenVT, "return isInt<10>(Imm);">; def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { let RenderMethod = "addSImm8UnsignedOperands"; } +// (<2 x i16>, <2 x i16>) PPACK_DH (<4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>) +def SDT_RISCVPPackDH + : SDTypeProfile<2, 4, [SDTCisVT<0, v2i16>, SDTCisSameAs<0, 1>, + SDTCisVT<2, v4i8>, SDTCisSameAs<0, 3>, + SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>]>; +def riscv_ppack_dh : RVSDNode<"PPACK_DH", SDT_RISCVPPackDH>; + // A 8-bit signed immediate allowing range [-128, 255] // but represented as [-128, 127]. 
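// (Illustrative example, not from this patch: an assembler operand of 200 and
// one of -56 select the same byte pattern 0xC8, so a value in [128, 255] is
// presumably carried in its wrapped signed form.)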
-def simm8_unsigned : RISCVOp { +def simm8_unsigned : RISCVOp, ImmLeaf<XLenVT, "return isInt<8>(Imm);"> { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<8>"; @@ -625,8 +632,8 @@ let Predicates = [HasStdExtP] in { def PSUB_H : RVPBinary_rr<0b1000, 0b00, 0b000, "psub.h">; def PSUB_B : RVPBinary_rr<0b1000, 0b10, 0b000, "psub.b">; - def PDIF_H : RVPBinary_rr<0b1001, 0b00, 0b000, "pdif.h">; - def PDIF_B : RVPBinary_rr<0b1001, 0b10, 0b000, "pdif.b">; + def PABD_H : RVPBinary_rr<0b1001, 0b00, 0b000, "pabd.h">; + def PABD_B : RVPBinary_rr<0b1001, 0b10, 0b000, "pabd.b">; def PSSUB_H : RVPBinary_rr<0b1010, 0b00, 0b000, "pssub.h">; def PSSUB_B : RVPBinary_rr<0b1010, 0b10, 0b000, "pssub.b">; @@ -634,8 +641,8 @@ let Predicates = [HasStdExtP] in { def PASUB_H : RVPBinary_rr<0b1011, 0b00, 0b000, "pasub.h">; def PASUB_B : RVPBinary_rr<0b1011, 0b10, 0b000, "pasub.b">; - def PDIFU_H : RVPBinary_rr<0b1101, 0b00, 0b000, "pdifu.h">; - def PDIFU_B : RVPBinary_rr<0b1101, 0b10, 0b000, "pdifu.b">; + def PABDU_H : RVPBinary_rr<0b1101, 0b00, 0b000, "pabdu.h">; + def PABDU_B : RVPBinary_rr<0b1101, 0b10, 0b000, "pabdu.b">; def PSSUBU_H : RVPBinary_rr<0b1110, 0b00, 0b000, "pssubu.h">; def PSSUBU_B : RVPBinary_rr<0b1110, 0b10, 0b000, "pssubu.b">; @@ -693,9 +700,9 @@ let Predicates = [HasStdExtP] in { def SRX : RVPTernary_rrr<0b0101, 0b11, 0b001, "srx">; def PMULU_H_B01 : RVPBinary_rr<0b0110, 0b00, 0b001, "pmulu.h.b01">; - def PDIFSUMU_B : RVPBinary_rr<0b0110, 0b10, 0b001, "pdifsumu.b">; + def PABDSUMU_B : RVPBinary_rr<0b0110, 0b10, 0b001, "pabdsumu.b">; - def PDIFSUMAU_B : RVPTernary_rrr<0b0111, 0b10, 0b001, "pdifsumau.b">; + def PABDSUMAU_B : RVPTernary_rrr<0b0111, 0b10, 0b001, "pabdsumau.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { def MUL_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "mul.h01">; @@ -825,32 +832,25 @@ let Predicates = [HasStdExtP, IsRV64] in { // Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. // Here we include the 0 in the f field to reduce number of tablegen classes. 
let Predicates = [HasStdExtP] in { - def PPACK_H : RVPBinary_rr<0b0000, 0b00, 0b100, "ppack.h">; + def PPAIRE_B : RVPBinary_rr<0b0000, 0b00, 0b100, "ppaire.b">; - def PPACKBT_H : RVPBinary_rr<0b0010, 0b00, 0b100, "ppackbt.h">; + def PPAIREO_B : RVPBinary_rr<0b0010, 0b00, 0b100, "ppaireo.b">; + def PPAIREO_H : RVPBinary_rr<0b0010, 0b01, 0b100, "ppaireo.h">; - def PPACKTB_H : RVPBinary_rr<0b0100, 0b00, 0b100, "ppacktb.h">; + def PPAIROE_B : RVPBinary_rr<0b0100, 0b00, 0b100, "ppairoe.b">; + def PPAIROE_H : RVPBinary_rr<0b0100, 0b01, 0b100, "ppairoe.h">; - def PPACKT_H : RVPBinary_rr<0b0110, 0b00, 0b100, "ppackt.h">; + def PPAIRO_B : RVPBinary_rr<0b0110, 0b00, 0b100, "ppairo.b">; + def PPAIRO_H : RVPBinary_rr<0b0110, 0b01, 0b100, "ppairo.h">; } // Predicates = [HasStdExtP] -let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { - def PACKBT_RV32 : RVPBinary_rr<0b0010, 0b01, 0b100, "packbt">; - - def PACKTB_RV32 : RVPBinary_rr<0b0100, 0b01, 0b100, "packtb">; - - def PACKT_RV32 : RVPBinary_rr<0b0110, 0b01, 0b100, "packt">; -} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" let Predicates = [HasStdExtP, IsRV64] in { - def PPACK_W : RVPBinary_rr<0b0000, 0b01, 0b100, "ppack.w">; + def PPAIRE_H : RVPBinary_rr<0b0000, 0b01, 0b100, "ppaire.h">; - def PPACKBT_W : RVPBinary_rr<0b0010, 0b01, 0b100, "ppackbt.w">; - def PACKBT_RV64 : RVPBinary_rr<0b0010, 0b11, 0b100, "packbt">; + def PPAIREO_W : RVPBinary_rr<0b0010, 0b11, 0b100, "ppaireo.w">; - def PPACKTB_W : RVPBinary_rr<0b0100, 0b01, 0b100, "ppacktb.w">; - def PACKTB_RV64 : RVPBinary_rr<0b0100, 0b11, 0b100, "packtb">; + def PPAIROE_W : RVPBinary_rr<0b0100, 0b11, 0b100, "ppairoe.w">; - def PPACKT_W : RVPBinary_rr<0b0110, 0b01, 0b100, "ppackt.w">; - def PACKT_RV64 : RVPBinary_rr<0b0110, 0b11, 0b100, "packt">; + def PPAIRO_W : RVPBinary_rr<0b0110, 0b11, 0b100, "ppairo.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { @@ -1378,8 +1378,8 @@ let Predicates = [HasStdExtP, IsRV32] in { def PSUB_DB : RVPPairBinary_rr<0b1000, 0b10, "psub.db">; def SUBD : RVPPairBinary_rr<0b1000, 0b11, "subd">; - def PDIF_DH : RVPPairBinary_rr<0b1001, 0b00, "pdif.dh">; - def PDIF_DB : RVPPairBinary_rr<0b1001, 0b10, "pdif.db">; + def PABD_DH : RVPPairBinary_rr<0b1001, 0b00, "pabd.dh">; + def PABD_DB : RVPPairBinary_rr<0b1001, 0b10, "pabd.db">; def PSSUB_DH : RVPPairBinary_rr<0b1010, 0b00, "pssub.dh">; def PSSUB_DW : RVPPairBinary_rr<0b1010, 0b01, "pssub.dw">; @@ -1389,8 +1389,8 @@ let Predicates = [HasStdExtP, IsRV32] in { def PASUB_DW : RVPPairBinary_rr<0b1011, 0b01, "pasub.dw">; def PASUB_DB : RVPPairBinary_rr<0b1011, 0b10, "pasub.db">; - def PDIFU_DH : RVPPairBinary_rr<0b1101, 0b00, "pdifu.dh">; - def PDIFU_DB : RVPPairBinary_rr<0b1101, 0b10, "pdifu.db">; + def PABDU_DH : RVPPairBinary_rr<0b1101, 0b00, "pabdu.dh">; + def PABDU_DB : RVPPairBinary_rr<0b1101, 0b10, "pabdu.db">; def PSSUBU_DH : RVPPairBinary_rr<0b1110, 0b00, "pssubu.dh">; def PSSUBU_DW : RVPPairBinary_rr<0b1110, 0b01, "pssubu.dw">; @@ -1406,17 +1406,17 @@ let Predicates = [HasStdExtP, IsRV32] in { def PSSH1SADD_DH : RVPPairBinaryShift_rr<0b011, 0b00, "pssh1sadd.dh">; def PSSH1SADD_DW : RVPPairBinaryShift_rr<0b011, 0b01, "pssh1sadd.dw">; - def PPACK_DH : RVPPairBinaryPack_rr<0b000, 0b00, "ppack.dh">; - def PPACK_DW : RVPPairBinaryPack_rr<0b000, 0b01, "ppack.dw">; + def PPAIRE_DB : RVPPairBinaryPack_rr<0b000, 0b00, "ppaire.db">; + def PPAIRE_DH : RVPPairBinaryPack_rr<0b000, 0b01, "ppaire.dh">; - def PPACKBT_DH : RVPPairBinaryPack_rr<0b001, 0b00, 
"ppackbt.dh">; - def PPACKBT_DW : RVPPairBinaryPack_rr<0b001, 0b01, "ppackbt.dw">; + def PPAIREO_DB : RVPPairBinaryPack_rr<0b001, 0b00, "ppaireo.db">; + def PPAIREO_DH : RVPPairBinaryPack_rr<0b001, 0b01, "ppaireo.dh">; - def PPACKTB_DH : RVPPairBinaryPack_rr<0b010, 0b00, "ppacktb.dh">; - def PPACKTB_DW : RVPPairBinaryPack_rr<0b010, 0b01, "ppacktb.dw">; + def PPAIROE_DB : RVPPairBinaryPack_rr<0b010, 0b00, "ppairoe.db">; + def PPAIROE_DH : RVPPairBinaryPack_rr<0b010, 0b01, "ppairoe.dh">; - def PPACKT_DH : RVPPairBinaryPack_rr<0b011, 0b00, "ppackt.dh">; - def PPACKT_DW : RVPPairBinaryPack_rr<0b011, 0b01, "ppackt.dw">; + def PPAIRO_DB : RVPPairBinaryPack_rr<0b011, 0b00, "ppairo.db">; + def PPAIRO_DH : RVPPairBinaryPack_rr<0b011, 0b01, "ppairo.dh">; def PAS_DHX : RVPPairBinaryExchanged_rr<0b0000, 0b00, "pas.dhx">; def PSA_DHX : RVPPairBinaryExchanged_rr<0b0000, 0b10, "psa.dhx">; @@ -1461,10 +1461,170 @@ let Predicates = [HasStdExtP, IsRV32] in { // Codegen patterns //===----------------------------------------------------------------------===// -def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; +def riscv_absw : RVSDNode<"ABSW", SDT_RISCVIntUnaryOpW>; -let Predicates = [HasStdExtP] in -def : PatGpr<abs, ABS>; +def SDT_RISCVPASUB : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>; +def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUB>; -let Predicates = [HasStdExtP, IsRV64] in -def : PatGpr<riscv_absw, ABSW>; +let Predicates = [HasStdExtP] in { + def : PatGpr<abs, ABS>; + + // Basic 8-bit arithmetic patterns + def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>; + + // Basic 16-bit arithmetic patterns + def: Pat<(XLenVecI16VT (add GPR:$rs1, GPR:$rs2)), (PADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit saturating add/sub patterns + def: Pat<(XLenVecI8VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit saturating add/sub patterns + def: Pat<(XLenVecI16VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit averaging patterns + def: Pat<(XLenVecI8VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit averaging patterns + def: Pat<(XLenVecI16VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_H GPR:$rs1, 
GPR:$rs2)>; + + // 8-bit absolute difference patterns + def: Pat<(XLenVecI8VT (abds GPR:$rs1, GPR:$rs2)), (PABD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (abdu GPR:$rs1, GPR:$rs2)), (PABDU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit absolute difference patterns + def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PABD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PABDU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit logical shift left patterns + def: Pat<(XLenVecI8VT (shl GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), + (PSLLI_B GPR:$rs1, uimm3:$shamt)>; + + // 16-bit logical shift left patterns + def: Pat<(XLenVecI16VT (shl GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSLLI_H GPR:$rs1, uimm4:$shamt)>; + + // 16-bit signed saturation shift left patterns + def: Pat<(XLenVecI16VT (sshlsat GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSSLAI_H GPR:$rs1, uimm4:$shamt)>; + + // 8-bit logical shift left + def: Pat<(XLenVecI8VT (shl GPR:$rs1, + (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), + (PSLL_BS GPR:$rs1, GPR:$rs2)>; + // 16-bit logical shift left + def: Pat<(XLenVecI16VT (shl GPR:$rs1, + (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), + (PSLL_HS GPR:$rs1, GPR:$rs2)>; + + // 8-bit PLI SD node pattern + def: Pat<(XLenVecI8VT (splat_vector simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + // 16-bit PLI SD node pattern + def: Pat<(XLenVecI16VT (splat_vector simm10:$imm10)), (PLI_H simm10:$imm10)>; + + // // splat pattern + def: Pat<(XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))), (PADD_BS (XLenVT X0), GPR:$rs2)>; + def: Pat<(XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))), (PADD_HS (XLenVT X0), GPR:$rs2)>; +} // Predicates = [HasStdExtP] + +let Predicates = [HasStdExtP, IsRV32] in { + // Load/Store patterns + def : StPat<store, SW, GPR, v4i8>; + def : StPat<store, SW, GPR, v2i16>; + def : LdPat<load, LW, v4i8>; + def : LdPat<load, LW, v2i16>; + + // Build vector patterns + def : Pat<(v2i16 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b))), + (PACK GPR:$a, GPR:$b)>; +} // Predicates = [HasStdExtP, IsRV32] + +let Predicates = [HasStdExtP, IsRV64] in { + def : PatGpr<riscv_absw, ABSW>; + + // 32-bit PLI SD node pattern + def: Pat<(v2i32 (splat_vector simm10:$imm10)), (PLI_W simm10:$imm10)>; + + // Basic 32-bit arithmetic patterns + def: Pat<(v2i32 (add GPR:$rs1, GPR:$rs2)), (PADD_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (sub GPR:$rs1, GPR:$rs2)), (PSUB_W GPR:$rs1, GPR:$rs2)>; + + // 32-bit saturating add/sub patterns + def: Pat<(v2i32 (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_W GPR:$rs1, GPR:$rs2)>; + + // 32-bit averaging patterns + def: Pat<(v2i32 (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_W GPR:$rs1, GPR:$rs2)>; + + // 32-bit averaging-sub patterns + def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; + + // 32-bit logical shift left + def: Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), + (PSLL_WS GPR:$rs1, GPR:$rs2)>; + + // splat pattern + def: Pat<(v2i32 (splat_vector (XLenVT GPR:$rs2))), (PADD_WS (XLenVT X0), GPR:$rs2)>; + + // 32-bit logical shift left patterns + def: 
Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSLLI_W GPR:$rs1, uimm5:$shamt)>; + + // 32-bit signed saturation shift left patterns + def: Pat<(v2i32 (sshlsat GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSSLAI_W GPR:$rs1, uimm5:$shamt)>; + + // Load/Store patterns + def : StPat<store, SD, GPR, v8i8>; + def : StPat<store, SD, GPR, v4i16>; + def : StPat<store, SD, GPR, v2i32>; + def : LdPat<load, LD, v8i8>; + def : LdPat<load, LD, v4i16>; + def : LdPat<load, LD, v2i32>; + + // Build vector patterns + def : Pat<(v8i8 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b), + (XLenVT GPR:$c), (XLenVT GPR:$d), + (XLenVT undef), (XLenVT undef), + (XLenVT undef), (XLenVT undef))), + (PPAIRE_H (PPAIRE_B GPR:$a, GPR:$b), (PPAIRE_B GPR:$c, GPR:$d))>; + + def : Pat<(v8i8 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b), + (XLenVT GPR:$c), (XLenVT GPR:$d), + (XLenVT GPR:$e), (XLenVT GPR:$f), + (XLenVT GPR:$g), (XLenVT GPR:$h))), + (PACK (PPAIRE_H (PPAIRE_B GPR:$a, GPR:$b), (PPAIRE_B GPR:$c, GPR:$d)), + (PPAIRE_H (PPAIRE_B GPR:$e, GPR:$f), (PPAIRE_B GPR:$g, GPR:$h)))>; + + def : Pat<(v4i16 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b), + (XLenVT undef), (XLenVT undef))), + (PPAIRE_H GPR:$a, GPR:$b)>; + + def : Pat<(v4i16 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b), + (XLenVT GPR:$c), (XLenVT GPR:$d))), + (PACK (PPAIRE_H GPR:$a, GPR:$b), (PPAIRE_H GPR:$c, GPR:$d))>; + + def : Pat<(v2i32 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b))), + (PACK GPR:$a, GPR:$b)>; +} // Predicates = [HasStdExtP, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 494b1c9..6563cc2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -let Predicates = [HasShortForwardBranchOpt], isSelect = 1, +let Predicates = [HasShortForwardBranchIALU], isSelect = 1, Constraints = "$dst = $falsev", isCommutable = 1, Size = 8 in { // This instruction moves $truev to $dst when the condition is true. It will // be expanded to control flow in RISCVExpandPseudoInsts. @@ -28,7 +28,7 @@ def PseudoCCMOVGPR : Pseudo<(outs GPR:$dst), // This should always expand to a branch+c.mv so the size is 6 or 4 if the // branch is compressible. -let Predicates = [HasConditionalMoveFusion, NoShortForwardBranchOpt], +let Predicates = [HasConditionalMoveFusion, NoShortForwardBranch], Constraints = "$dst = $falsev", isCommutable = 1, Size = 6 in { // This instruction moves $truev to $dst when the condition is true. It will // be expanded to control flow in RISCVExpandPseudoInsts. @@ -69,6 +69,17 @@ class SFBALU_ri let Constraints = "$dst = $falsev"; } +class SFBLUI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + uimm20_lui:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + class SFBShift_ri : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, @@ -97,7 +108,7 @@ class SFBShiftW_ri // is true. Returns $falsev otherwise. Selected by optimizeSelect. // TODO: Can we use DefaultOperands on the regular binop to accomplish this more // like how ARM does predication? 
-let Predicates = [HasShortForwardBranchOpt] in { +let Predicates = [HasShortForwardBranchIALU] in { def PseudoCCADD : SFBALU_rr; def PseudoCCSUB : SFBALU_rr; def PseudoCCSLL : SFBALU_rr; @@ -106,17 +117,14 @@ def PseudoCCSRA : SFBALU_rr; def PseudoCCAND : SFBALU_rr; def PseudoCCOR : SFBALU_rr; def PseudoCCXOR : SFBALU_rr; -def PseudoCCMAX : SFBALU_rr; -def PseudoCCMIN : SFBALU_rr; -def PseudoCCMAXU : SFBALU_rr; -def PseudoCCMINU : SFBALU_rr; -def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; def PseudoCCORI : SFBALU_ri; def PseudoCCXORI : SFBALU_ri; +def PseudoCCLUI : SFBLUI; + def PseudoCCSLLI : SFBShift_ri; def PseudoCCSRLI : SFBShift_ri; def PseudoCCSRAI : SFBShift_ri; @@ -140,11 +148,21 @@ def PseudoCCORN : SFBALU_rr; def PseudoCCXNOR : SFBALU_rr; } -let Predicates = [HasShortForwardBranchOpt] in +let Predicates = [HasShortForwardBranchIALU] in def : Pat<(XLenVT (abs GPR:$rs1)), (PseudoCCSUB (XLenVT GPR:$rs1), (XLenVT X0), /* COND_LT */ 2, (XLenVT GPR:$rs1), (XLenVT X0), (XLenVT GPR:$rs1))>; -let Predicates = [HasShortForwardBranchOpt, IsRV64] in +let Predicates = [HasShortForwardBranchIALU, IsRV64] in def : Pat<(sext_inreg (abs 33signbits_node:$rs1), i32), (PseudoCCSUBW (i64 GPR:$rs1), (i64 X0), /* COND_LT */ 2, (i64 GPR:$rs1), (i64 X0), (i64 GPR:$rs1))>; + +let Predicates = [HasShortForwardBranchIMinMax] in { +def PseudoCCMAX : SFBALU_rr; +def PseudoCCMIN : SFBALU_rr; +def PseudoCCMAXU : SFBALU_rr; +def PseudoCCMINU : SFBALU_rr; +} + +let Predicates = [HasShortForwardBranchIMul] in +def PseudoCCMUL : SFBALU_rr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index f46455a..594a75a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -79,19 +79,19 @@ def simm5 : RISCVSImmLeafOp<5> { } def simm5_plus1 : RISCVOp, ImmLeaf<XLenVT, - [{return (isInt<5>(Imm) && Imm != -16) || Imm == 16;}]> { + [{return Imm >= -15 && Imm <= 16;}]> { let ParserMatchClass = SImmAsmOperand<5, "Plus1">; let OperandType = "OPERAND_SIMM5_PLUS1"; let MCOperandPredicate = [{ int64_t Imm; if (MCOp.evaluateAsConstantImm(Imm)) - return (isInt<5>(Imm) && Imm != -16) || Imm == 16; + return Imm >= -15 && Imm <= 16; return MCOp.isBareSymbolRef(); }]; } def simm5_plus1_nonzero : ImmLeaf<XLenVT, - [{return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16);}]>; + [{return Imm != 0 && Imm >= -15 && Imm <= 16;}]>; //===----------------------------------------------------------------------===// // Scheduling definitions. 
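Several immediate predicates in this patch are rewritten from set-based forms such as (isInt<5>(Imm) && Imm != -16) || Imm == 16 into plain range comparisons (Imm >= -15 && Imm <= 16), and likewise for the unsigned plus-one operands. A small self-contained sanity check (not taken from this patch; isIntN/isUIntN below are local stand-ins for llvm::isInt/llvm::isUInt) that the old and new forms accept exactly the same values:

    // check_plus1_ranges.cpp - exhaustively compare old and new predicates.
    #include <cassert>
    #include <cstdint>

    template <unsigned N> bool isIntN(int64_t x) {   // like llvm::isInt<N>
      return x >= -(INT64_C(1) << (N - 1)) && x < (INT64_C(1) << (N - 1));
    }
    template <unsigned N> bool isUIntN(int64_t x) {  // like llvm::isUInt<N>
      return x >= 0 && x < (INT64_C(1) << N);
    }

    int main() {
      for (int64_t Imm = -1000; Imm <= 1000; ++Imm) {
        // simm5_plus1 / OPERAND_SIMM5_PLUS1
        bool OldS = (isIntN<5>(Imm) && Imm != -16) || Imm == 16;
        bool NewS = Imm >= -15 && Imm <= 16;
        assert(OldS == NewS);
        // uimm5_plus1 / OPERAND_UIMM5_PLUS1
        bool OldU = (isUIntN<5>(Imm) && Imm != 0) || Imm == 32;
        bool NewU = Imm >= 1 && Imm <= 32;
        assert(OldU == NewU);
      }
      return 0;
    }

Both predicates agree over the whole scan range, which matches the intent of the rewrite: the ranges [-15, 16] and [1, 32] are exactly the old sets with the plus-one endpoint folded in.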
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index eb3c9b0..e36204c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2982,21 +2982,21 @@ multiclass VPseudoVFWALU_WV_WF_RM { multiclass VPseudoVMRG_VM_XM_IM { foreach m = MxList in { defvar mx = m.MX; - def "_VVM" # "_" # m.MX: - VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, - m.vrclass, m.vrclass, m>, - SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx, - forcePassthruRead=true>; - def "_VXM" # "_" # m.MX: - VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, - m.vrclass, GPR, m>, - SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx, - forcePassthruRead=true>; - def "_VIM" # "_" # m.MX: - VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, - m.vrclass, simm5, m>, - SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx, - forcePassthruRead=true>; + def "_VVM"#"_"#m.MX : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, + GetVRegNoV0<m.vrclass>.R, + GetVRegNoV0<m.vrclass>.R, m>, + SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx, + forcePassthruRead = true>; + def "_VXM"#"_"#m.MX + : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, + GetVRegNoV0<m.vrclass>.R, GPR, m>, + SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx, + forcePassthruRead = true>; + def "_VIM"#"_"#m.MX + : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, + GetVRegNoV0<m.vrclass>.R, simm5, m>, + SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx, + forcePassthruRead = true>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 139ff92..a67112b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -215,8 +215,8 @@ multiclass VPatBinaryFPSDNode_VV_VF<SDPatternOperator vop, string instruction_na } multiclass VPatBinaryFPSDNode_VV_VF_RM<SDPatternOperator vop, string instruction_name, - bit isSEWAware = 0> { - foreach vti = AllFloatVectors in { + bit isSEWAware = 0, bit isBF16 = 0> { + foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { let Predicates = GetVTypePredicates<vti>.Predicates in { def : VPatBinarySDNode_VV_RM<vop, instruction_name, vti.Vector, vti.Vector, vti.Log2SEW, @@ -246,8 +246,8 @@ multiclass VPatBinaryFPSDNode_R_VF<SDPatternOperator vop, string instruction_nam } multiclass VPatBinaryFPSDNode_R_VF_RM<SDPatternOperator vop, string instruction_name, - bit isSEWAware = 0> { - foreach fvti = AllFloatVectors in + bit isSEWAware = 0, bit isBF16 = 0> { + foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in let Predicates = GetVTypePredicates<fvti>.Predicates in def : Pat<(fvti.Vector (vop (fvti.Vector (SplatFPOp fvti.Scalar:$rs2)), (fvti.Vector fvti.RegClass:$rs1))), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index cf904ea..38edab5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1058,8 +1058,8 @@ multiclass VPatBinaryFPVL_VV_VF<SDPatternOperator vop, string instruction_name, } multiclass VPatBinaryFPVL_VV_VF_RM<SDPatternOperator vop, string instruction_name, - bit isSEWAware = 0> { - foreach vti = AllFloatVectors in { + bit isSEWAware = 0, bit isBF16 = 0> { + foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { let Predicates = GetVTypePredicates<vti>.Predicates 
in { def : VPatBinaryVL_V_RM<vop, instruction_name, "VV", vti.Vector, vti.Vector, vti.Vector, vti.Mask, @@ -1093,8 +1093,8 @@ multiclass VPatBinaryFPVL_R_VF<SDPatternOperator vop, string instruction_name, } multiclass VPatBinaryFPVL_R_VF_RM<SDPatternOperator vop, string instruction_name, - bit isSEWAware = 0> { - foreach fvti = AllFloatVectors in { + bit isSEWAware = 0, bit isBF16 = 0> { + foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { let Predicates = GetVTypePredicates<fvti>.Predicates in def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), fvti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index b683e89..80aded3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -838,7 +838,6 @@ def : Pat<(fpextend (bf16 FPR16:$rs)), (NDS_FCVT_S_BF16 (bf16 FPR16:$rs))>; def : Pat<(bf16 (fpround FPR32:$rs)), (NDS_FCVT_BF16_S FPR32:$rs)>; -} // Predicates = [HasVendorXAndesBFHCvt] let isCodeGenOnly = 1 in { def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">, @@ -847,7 +846,6 @@ def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; } -let Predicates = [HasVendorXAndesBFHCvt] in { def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>; def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)), (NDS_FMV_X_BF16 (bf16 FPR16:$src))>; @@ -914,7 +912,7 @@ defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU", // Pseudo-instructions for SFB (Short Forward Branch) //===----------------------------------------------------------------------===// -let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0, +let Predicates = [HasShortForwardBranchIALU], hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in { def PseudoCCNDS_BFOS : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index aa8f1a1..7abc616 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -633,8 +633,9 @@ let Predicates = [HasVendorXCVmem, IsRV32] in { def CV_SW_rr : CVStore_rr<0b011, 0b0010110, "cv.sw">; } -let Predicates = [HasVendorXCVelw, IsRV32], hasSideEffects = 0, +let Predicates = [HasVendorXCVelw, IsRV32], hasSideEffects = 1, mayLoad = 1, mayStore = 0 in { + def PseudoCV_ELW : PseudoLoad<"cv.elw">; // Event load def CV_ELW : CVLoad_ri<0b011, "cv.elw">; } @@ -706,6 +707,12 @@ let Predicates = [HasVendorXCVmem, IsRV32], AddedComplexity = 1 in { def : CVStrrPat<store, CV_SW_rr>; } +let Predicates = [HasVendorXCVelw, IsRV32] in { + def : Pat<(int_riscv_cv_elw_elw (XLenVT GPR:$rs1)), (PseudoCV_ELW GPR:$rs1)>; + def : Pat<(int_riscv_cv_elw_elw (AddrRegImm (XLenVT GPR:$rs1), simm12_lo:$imm12)), + (CV_ELW GPR:$rs1, simm12_lo:$imm12)>; +} + multiclass PatCoreVBitManip<Intrinsic intr> { def : PatGprGpr<intr, !cast<RVInst>("CV_" # NAME # "R")>; def : Pat<(intr GPR:$rs1, cv_uimm10:$imm), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index f7b4914..c07ed85 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -935,7 +935,7 @@ let Predicates = [HasVendorXSfcease] in { let rd = 0b00000; let rs1 = 0b00000; let rs2 = 0b00101; -} + } } let 
Predicates = [HasVendorXSfvfbfexp16e] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index d77a44a..445e513 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -13,7 +13,6 @@ def XSfmmVTypeAsmOperand : AsmOperandClass { let Name = "XSfmmVType"; let ParserMethod = "parseXSfmmVType"; - let DiagnosticType = "InvalidXSfmmVType"; let RenderMethod = "addVTypeIOperands"; } @@ -279,7 +278,7 @@ let Uses = [FRM], mayRaiseFPException = true in { } // DecoderNamespace = "XSfvector" class VPseudoSF_VTileLoad - : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, GPRNoX0:$atn, ixlenimm:$sew, ixlenimm:$twiden)> { let mayLoad = 1; let mayStore = 0; @@ -290,7 +289,7 @@ class VPseudoSF_VTileLoad } class VPseudoSF_VTileStore - : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, GPRNoX0:$atn, ixlenimm:$sew, ixlenimm:$twiden)> { let mayLoad = 0; let mayStore = 1; @@ -301,7 +300,7 @@ class VPseudoSF_VTileStore } class VPseudoSF_VTileMove_V_T - : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew, + : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, GPRNoX0:$atn, ixlenimm:$sew, ixlenimm:$twiden)> { let mayLoad = 0; let mayStore = 0; @@ -312,7 +311,7 @@ class VPseudoSF_VTileMove_V_T } class VPseudoSF_VTileMove_T_V - : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew, + : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, GPRNoX0:$atn, ixlenimm:$sew, ixlenimm:$twiden)> { let mayLoad = 0; let mayStore = 0; @@ -324,8 +323,9 @@ class VPseudoSF_VTileMove_T_V class VPseudoSF_MatMul<RegisterClass mtd_class> : RISCVVPseudo<(outs), - (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn, - AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> { + (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, GPRNoX0:$atm, + GPRNoX0:$atn, GPRNoX0:$atk, ixlenimm:$sew, + ixlenimm:$twiden)> { let mayLoad = 0; let mayStore = 0; let HasTmOp = 1; @@ -339,7 +339,7 @@ class VPseudoSF_MatMul<RegisterClass mtd_class> class VPseudoSF_MatMul_FRM<RegisterClass mtd_class> : RISCVVPseudo<(outs), (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm, - AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew, + GPRNoX0:$atm, GPRNoX0:$atn, GPRNoX0:$atk, ixlenimm:$sew, ixlenimm:$twiden), []> { let mayLoad = 0; let mayStore = 0; @@ -414,7 +414,7 @@ let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in def PseudoSF_VTZERO_T : RISCVVPseudo<(outs), - (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew, + (ins TR:$rd, GPRNoX0:$atm, GPRNoX0:$atn, ixlenimm:$sew, ixlenimm:$twiden)>; def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>; } @@ -425,7 +425,7 @@ class VPatXSfmmTileStore<string intrinsic_name, Pat<(!cast<Intrinsic>(intrinsic_name) (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), - (XLenVT AVL:$tn)), + (XLenVT GPRNoX0:$tn)), (!cast<Instruction>(inst_name) (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), @@ -438,7 +438,7 @@ class VPatXSfmmTileMove_T_V<string intrinsic_name, Pat<(!cast<Intrinsic>(intrinsic_name) (XLenVT GPR:$rs1), (reg_type VRM8:$vs2), - (XLenVT AVL:$atn)), + (XLenVT GPRNoX0:$atn)), (!cast<Instruction>(inst_name) (XLenVT GPR:$rs1), (reg_type VRM8:$vs2), @@ -450,7 +450,7 @@ class VPatXSfmmTileMove_V_T<string intrinsic_name, int log2sew> : Pat<(result_type (!cast<Intrinsic>(intrinsic_name) (XLenVT GPR:$rs1), - (XLenVT 
AVL:$atn))), + (XLenVT GPRNoX0:$atn))), (!cast<Instruction>(inst_name) (XLenVT GPR:$rs1), GPR:$atn, log2sew, 1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 8376da5..748494f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -62,7 +62,7 @@ def UImm5Plus1AsmOperand : AsmOperandClass { } def uimm5_plus1 : RISCVOp, ImmLeaf<XLenVT, - [{return (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32);}]> { + [{return Imm >= 1 && Imm <= 32;}]> { let ParserMatchClass = UImm5Plus1AsmOperand; let EncoderMethod = "getImmOpValueMinus1"; let DecoderMethod = "decodeUImmPlus1Operand<5>"; @@ -71,12 +71,12 @@ def uimm5_plus1 : RISCVOp, ImmLeaf<XLenVT, int64_t Imm; if (!MCOp.evaluateAsConstantImm(Imm)) return false; - return (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32); + return Imm >= 1 && Imm <= 32; }]; } def uimm5ge6_plus1 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, - [{return (Imm >= 6) && (isUInt<5>(Imm) || (Imm == 32));}]> { + [{return Imm >= 6 && Imm <= 32;}]> { let ParserMatchClass = UImmAsmOperand<5, "GE6Plus1">; let EncoderMethod = "getImmOpValueMinus1"; let DecoderMethod = "decodeUImmPlus1OperandGE<5,6>"; @@ -85,7 +85,7 @@ def uimm5ge6_plus1 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, int64_t Imm; if (!MCOp.evaluateAsConstantImm(Imm)) return false; - return (Imm >= 6) && (isUInt<5>(Imm) || (Imm == 32)); + return Imm >= 6 && Imm <= 32; }]; } @@ -817,6 +817,28 @@ class QCIRVInst48EJ<bits<2> func2, string opcodestr> let Inst{6-0} = 0b0011111; } +class SFBQC_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + simm20_li:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBQC_E_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + bare_simm32:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 10; + let Constraints = "$dst = $falsev"; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1308,6 +1330,11 @@ def PseudoQC_E_SH : PseudoStore<"qc.e.sh">; def PseudoQC_E_SW : PseudoStore<"qc.e.sw">; } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasShortForwardBranchIALU] in { +def PseudoCCQC_LI : SFBQC_LI; +def PseudoCCQC_E_LI : SFBQC_E_LI; +} + //===----------------------------------------------------------------------===// // Code Gen Patterns //===----------------------------------------------------------------------===// @@ -1544,7 +1571,7 @@ def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>; let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; -let Predicates = [HasVendorXqcicm, NoShortForwardBranchOpt, IsRV32] in { +let Predicates = [HasVendorXqcicm, NoShortForwardBranch, IsRV32] in { def : QCIMVCCPat<SETEQ, QC_MVEQ>; def : QCIMVCCPat<SETNE, QC_MVNE>; def : QCIMVCCPat<SETLT, QC_MVLT>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 5429c2a..3730f55 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -276,10 +276,9 @@ def XNOR : ALU_rr<0b0100000, 0b100, "xnor", Commutable=1>, Sched<[WriteIALU, ReadIALU, ReadIALU]>; } // Predicates = [HasStdExtZbbOrZbkb] -let Predicates = [HasStdExtZbaOrP] in 
+let Predicates = [HasStdExtZba] in { def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; -let Predicates = [HasStdExtZba] in { def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, @@ -351,32 +350,30 @@ def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[WriteXPERM, ReadXPERM, ReadXPERM]>; } // Predicates = [HasStdExtZbkx] -let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in +let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { def CLZ : Unary_r<0b011000000000, 0b001, "clz">, Sched<[WriteCLZ, ReadCLZ]>; -let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { def CTZ : Unary_r<0b011000000001, 0b001, "ctz">, Sched<[WriteCTZ, ReadCTZ]>; def CPOP : Unary_r<0b011000000010, 0b001, "cpop">, Sched<[WriteCPOP, ReadCPOP]>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrP, IsRV64], IsSignExtendingOpW = 1 in +let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in { def CLZW : UnaryW_r<0b011000000000, 0b001, "clzw">, Sched<[WriteCLZ32, ReadCLZ32]>; -let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in { def CTZW : UnaryW_r<0b011000000001, 0b001, "ctzw">, Sched<[WriteCTZ32, ReadCTZ32]>; def CPOPW : UnaryW_r<0b011000000010, 0b001, "cpopw">, Sched<[WriteCPOP32, ReadCPOP32]>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in { +let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { def SEXT_B : Unary_r<0b011000000100, 0b001, "sext.b">, Sched<[WriteIALU, ReadIALU]>; def SEXT_H : Unary_r<0b011000000101, 0b001, "sext.h">, Sched<[WriteIALU, ReadIALU]>; -} // Predicates = [HasStdExtZbbOrP] +} // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbc] in { def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr", Commutable=1>, @@ -390,7 +387,7 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", Commutable=1>, Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; } // Predicates = [HasStdExtZbcOrZbkc] -let Predicates = [HasStdExtZbbOrP] in { +let Predicates = [HasStdExtZbb] in { def MIN : ALU_rr<0b0000101, 0b100, "min", Commutable=1>, Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>; def MINU : ALU_rr<0b0000101, 0b101, "minu", Commutable=1>, @@ -399,7 +396,7 @@ def MAX : ALU_rr<0b0000101, 0b110, "max", Commutable=1>, Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>; def MAXU : ALU_rr<0b0000101, 0b111, "maxu", Commutable=1>, Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>; -} // Predicates = [HasStdExtZbbOrP] +} // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbkbOrP] in def PACK : ALU_rr<0b0000100, 0b100, "pack">, @@ -424,15 +421,15 @@ def ZEXT_H_RV64 : RVBUnaryR<0b0000100, 0b100, OPC_OP_32, "zext.h">, Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in { +let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in { def REV8_RV32 : Unary_r<0b011010011000, 0b101, "rev8">, Sched<[WriteREV8, ReadREV8]>; -} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] +} // Predicates = [HasStdExtZbbOrZbkb, IsRV32] -let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in { def REV8_RV64 : Unary_r<0b011010111000, 0b101, "rev8">, Sched<[WriteREV8, ReadREV8]>; -} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] +} // Predicates = [HasStdExtZbbOrZbkb, IsRV64] let Predicates = [HasStdExtZbb] in { def ORC_B : Unary_r<0b001010000111, 0b101, "orc.b">, @@ -599,20 +596,14 @@ def : 
PatGpr<riscv_zip, ZIP_RV32, i32>; def : PatGpr<riscv_unzip, UNZIP_RV32, i32>; } // Predicates = [HasStdExtZbkb, IsRV32] -let Predicates = [HasStdExtZbbOrP] in { -def : PatGpr<ctlz, CLZ>; -} - let Predicates = [HasStdExtZbb] in { +def : PatGpr<ctlz, CLZ>; def : PatGpr<cttz, CTZ>; def : PatGpr<ctpop, CPOP>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrP, IsRV64] in { -def : PatGpr<riscv_clzw, CLZW>; -} - let Predicates = [HasStdExtZbb, IsRV64] in { +def : PatGpr<riscv_clzw, CLZW>; def : PatGpr<riscv_ctzw, CTZW>; def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; @@ -620,22 +611,22 @@ def : Pat<(i64 (riscv_negw_max GPR:$rs1)), (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbbOrP] in { +let Predicates = [HasStdExtZbb] in { def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>; def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrP] in { +let Predicates = [HasStdExtZbb] in { def : PatGprGpr<smin, MIN>; def : PatGprGpr<smax, MAX>; def : PatGprGpr<umin, MINU>; def : PatGprGpr<umax, MAXU>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in +let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in def : PatGpr<bswap, REV8_RV32, i32>; -let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in +let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in def : PatGpr<bswap, REV8_RV64, i64>; let Predicates = [HasStdExtZbkb] in { @@ -652,6 +643,9 @@ def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), zexti8:$rs1), (PACKH zexti8:$rs1, GPR:$rs2)>; + +def : Pat<(shl (and GPR:$rs2, 0xFF), (XLenVT 8)), + (PACKH (XLenVT X0), GPR:$rs2)>; } // Predicates = [HasStdExtZbkb] let Predicates = [HasStdExtZbkb, IsRV32] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 1c6a5af..c172d17 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,7 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td index a3203f2..4fc859f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td @@ -47,6 +47,23 @@ let Predicates = [HasStdExtZilsd, IsRV32] in { def PseudoLD_RV32 : PseudoLoad<"ld", GPRPairRV32>; def PseudoSD_RV32 : PseudoStore<"sd", GPRPairRV32>; +// Pseudo instructions for load/store optimization with 2 separate registers +def PseudoLD_RV32_OPT : + Pseudo<(outs GPR:$rd1, GPR:$rd2), + (ins GPR:$rs1, simm12_lo:$imm12), [], "", ""> { + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; +} + +def PseudoSD_RV32_OPT : + Pseudo<(outs), + (ins GPR:$rs1, GPR:$rs2, GPR:$rs3, simm12_lo:$imm12), [], "", ""> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 1; +} + def : InstAlias<"ld $rd, (${rs1})", (LD_RV32 GPRPairRV32:$rd, GPR:$rs1, 0), 0>; def : InstAlias<"sd $rs2, (${rs1})", (SD_RV32 GPRPairRV32:$rs2, GPR:$rs1, 
0), 0>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index ffb2ac0..e24e4a3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -783,4 +783,22 @@ let Predicates = [HasStdExtZvfbfa] in { TAIL_AGNOSTIC)>; } } + + defm : VPatBinaryFPSDNode_VV_VF_RM<any_fadd, "PseudoVFADD_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPSDNode_VV_VF_RM<any_fsub, "PseudoVFSUB_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPSDNode_VV_VF_RM<any_fmul, "PseudoVFMUL_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPSDNode_R_VF_RM<any_fsub, "PseudoVFRSUB_ALT", + isSEWAware=1, isBF16=1>; + + defm : VPatBinaryFPVL_VV_VF_RM<any_riscv_fadd_vl, "PseudoVFADD_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPVL_VV_VF_RM<any_riscv_fsub_vl, "PseudoVFSUB_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPVL_VV_VF_RM<any_riscv_fmul_vl, "PseudoVFMUL_ALT", + isSEWAware=1, isBF16=1>; + defm : VPatBinaryFPVL_R_VF_RM<any_riscv_fsub_vl, "PseudoVFRSUB_ALT", + isSEWAware=1, isBF16=1>; } // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp index 115a96e..a22ab6b 100644 --- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp @@ -11,6 +11,9 @@ // paired instruction, leveraging hardware support for paired memory accesses. // Much of the pairing logic is adapted from the AArch64LoadStoreOpt pass. // +// Post-allocation Zilsd decomposition: Fixes invalid LD/SD instructions if +// register allocation didn't provide suitable consecutive registers. +// // NOTE: The AArch64LoadStoreOpt pass performs additional optimizations such as // merging zero store instructions, promoting loads that read directly from a // preceding store, and merging base register updates with load/store @@ -23,6 +26,7 @@ #include "RISCV.h" #include "RISCVTargetMachine.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/MC/TargetRegistry.h" @@ -38,6 +42,8 @@ using namespace llvm; // pairs. 
static cl::opt<unsigned> LdStLimit("riscv-load-store-scan-limit", cl::init(128), cl::Hidden); +STATISTIC(NumLD2LW, "Number of LD instructions split back to LW"); +STATISTIC(NumSD2SW, "Number of SD instructions split back to SW"); namespace { @@ -75,6 +81,13 @@ struct RISCVLoadStoreOpt : public MachineFunctionPass { mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, bool MergeForward); + // Post reg-alloc zilsd part + bool fixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); + bool isValidZilsdRegPair(Register First, Register Second); + void splitLdSdIntoTwo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, bool IsLoad); + private: AliasAnalysis *AA; MachineRegisterInfo *MRI; @@ -92,8 +105,6 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; const RISCVSubtarget &Subtarget = Fn.getSubtarget<RISCVSubtarget>(); - if (!Subtarget.useMIPSLoadStorePairs()) - return false; bool MadeChange = false; TII = Subtarget.getInstrInfo(); @@ -103,18 +114,34 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { ModifiedRegUnits.init(*TRI); UsedRegUnits.init(*TRI); - for (MachineBasicBlock &MBB : Fn) { - LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); + if (Subtarget.useMIPSLoadStorePairs()) { + for (MachineBasicBlock &MBB : Fn) { + LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + if (TII->isPairableLdStInstOpc(MBBI->getOpcode()) && + tryToPairLdStInst(MBBI)) + MadeChange = true; + else + ++MBBI; + } + } + } - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - if (TII->isPairableLdStInstOpc(MBBI->getOpcode()) && - tryToPairLdStInst(MBBI)) - MadeChange = true; - else - ++MBBI; + if (!Subtarget.is64Bit() && Subtarget.hasStdExtZilsd()) { + for (auto &MBB : Fn) { + for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { + if (fixInvalidRegPairOp(MBB, MBBI)) { + MadeChange = true; + // Iterator was updated by fixInvalidRegPairOp + } else { + ++MBBI; + } + } } } + return MadeChange; } @@ -395,6 +422,187 @@ RISCVLoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, return NextI; } +//===----------------------------------------------------------------------===// +// Post reg-alloc zilsd pass implementation +//===----------------------------------------------------------------------===// + +bool RISCVLoadStoreOpt::isValidZilsdRegPair(Register First, Register Second) { + // Special case: First register can not be zero unless both registers are + // zeros. + // Spec says: LD instructions with destination x0 are processed as any other + // load, but the result is discarded entirely and x1 is not written. If using + // x0 as src of SD, the entire 64-bit operand is zero — i.e., register x1 is + // not accessed. 
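  // Illustrative examples (not from this patch) of the rule enforced below:
  // {x10, x11} (encodings 10 and 11) is a valid Zilsd pair, {x11, x12} and
  // {x10, x13} are not, and {x0, x0} is the only pairing in which x0 may
  // appear.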
+ if (First == RISCV::X0) + return Second == RISCV::X0; + + // Check if registers form a valid even/odd pair for Zilsd + unsigned FirstNum = TRI->getEncodingValue(First); + unsigned SecondNum = TRI->getEncodingValue(Second); + + // Must be consecutive and first must be even + return (FirstNum % 2 == 0) && (SecondNum == FirstNum + 1); +} + +void RISCVLoadStoreOpt::splitLdSdIntoTwo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + bool IsLoad) { + MachineInstr *MI = &*MBBI; + DebugLoc DL = MI->getDebugLoc(); + + const MachineOperand &FirstOp = MI->getOperand(0); + const MachineOperand &SecondOp = MI->getOperand(1); + const MachineOperand &BaseOp = MI->getOperand(2); + Register FirstReg = FirstOp.getReg(); + Register SecondReg = SecondOp.getReg(); + Register BaseReg = BaseOp.getReg(); + + // Handle both immediate and symbolic operands for offset + const MachineOperand &OffsetOp = MI->getOperand(3); + int BaseOffset; + if (OffsetOp.isImm()) + BaseOffset = OffsetOp.getImm(); + else + // For symbolic operands, extract the embedded offset + BaseOffset = OffsetOp.getOffset(); + + unsigned Opc = IsLoad ? RISCV::LW : RISCV::SW; + MachineInstrBuilder MIB1, MIB2; + + // Create two separate instructions + if (IsLoad) { + // It's possible that first register is same as base register, when we split + // it becomes incorrect because base register is overwritten, e.g. + // X10, X13 = PseudoLD_RV32_OPT killed X10, 0 + // => + // X10 = LW X10, 0 + // X13 = LW killed X10, 4 + // we can just switch the order to resolve that: + // X13 = LW X10, 4 + // X10 = LW killed X10, 0 + if (FirstReg == BaseReg) { + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(SecondReg, + RegState::Define | getDeadRegState(SecondOp.isDead())) + .addReg(BaseReg); + MIB1 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(FirstReg, + RegState::Define | getDeadRegState(FirstOp.isDead())) + .addReg(BaseReg, getKillRegState(BaseOp.isKill())); + + } else { + MIB1 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(FirstReg, + RegState::Define | getDeadRegState(FirstOp.isDead())) + .addReg(BaseReg); + + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(SecondReg, + RegState::Define | getDeadRegState(SecondOp.isDead())) + .addReg(BaseReg, getKillRegState(BaseOp.isKill())); + } + + ++NumLD2LW; + LLVM_DEBUG(dbgs() << "Split LD back to two LW instructions\n"); + } else { + assert( + FirstReg != SecondReg && + "First register and second register is impossible to be same register"); + MIB1 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(FirstReg, getKillRegState(FirstOp.isKill())) + .addReg(BaseReg); + + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(Opc)) + .addReg(SecondReg, getKillRegState(SecondOp.isKill())) + .addReg(BaseReg, getKillRegState(BaseOp.isKill())); + + ++NumSD2SW; + LLVM_DEBUG(dbgs() << "Split SD back to two SW instructions\n"); + } + + // Add offset operands - preserve symbolic references + MIB1.add(OffsetOp); + if (OffsetOp.isImm()) + MIB2.addImm(BaseOffset + 4); + else if (OffsetOp.isGlobal()) + MIB2.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset + 4, + OffsetOp.getTargetFlags()); + else if (OffsetOp.isCPI()) + MIB2.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset + 4, + OffsetOp.getTargetFlags()); + else if (OffsetOp.isBlockAddress()) + MIB2.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset + 4, + OffsetOp.getTargetFlags()); + + // Copy memory operands if the original instruction had them + // FIXME: This is overly conservative; the new instruction accesses 4 bytes, + // not 8. 
+ MIB1.cloneMemRefs(*MI); + MIB2.cloneMemRefs(*MI); + + // Remove the original paired instruction and update iterator + MBBI = MBB.erase(MBBI); +} + +bool RISCVLoadStoreOpt::fixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = &*MBBI; + unsigned Opcode = MI->getOpcode(); + + // Check if this is a Zilsd pseudo that needs fixing + if (Opcode != RISCV::PseudoLD_RV32_OPT && Opcode != RISCV::PseudoSD_RV32_OPT) + return false; + + bool IsLoad = Opcode == RISCV::PseudoLD_RV32_OPT; + + const MachineOperand &FirstOp = MI->getOperand(0); + const MachineOperand &SecondOp = MI->getOperand(1); + Register FirstReg = FirstOp.getReg(); + Register SecondReg = SecondOp.getReg(); + + if (!isValidZilsdRegPair(FirstReg, SecondReg)) { + // Need to split back into two instructions + splitLdSdIntoTwo(MBB, MBBI, IsLoad); + return true; + } + + // Registers are valid, convert to real LD/SD instruction + const MachineOperand &BaseOp = MI->getOperand(2); + Register BaseReg = BaseOp.getReg(); + DebugLoc DL = MI->getDebugLoc(); + // Handle both immediate and symbolic operands for offset + const MachineOperand &OffsetOp = MI->getOperand(3); + + unsigned RealOpc = IsLoad ? RISCV::LD_RV32 : RISCV::SD_RV32; + + // Create register pair from the two individual registers + unsigned RegPair = TRI->getMatchingSuperReg(FirstReg, RISCV::sub_gpr_even, + &RISCV::GPRPairRegClass); + // Create the real LD/SD instruction with register pair + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(RealOpc)); + + if (IsLoad) { + // For LD, the register pair is the destination + MIB.addReg(RegPair, RegState::Define | getDeadRegState(FirstOp.isDead() && + SecondOp.isDead())); + } else { + // For SD, the register pair is the source + MIB.addReg(RegPair, getKillRegState(FirstOp.isKill() && SecondOp.isKill())); + } + + MIB.addReg(BaseReg, getKillRegState(BaseOp.isKill())) + .add(OffsetOp) + .cloneMemRefs(*MI); + + LLVM_DEBUG(dbgs() << "Converted pseudo to real instruction: " << *MIB + << "\n"); + + // Remove the pseudo instruction and update iterator + MBBI = MBB.erase(MBBI); + + return true; +} + // Returns an instance of the Load / Store Optimization pass. FunctionPass *llvm::createRISCVLoadStoreOptPass() { return new RISCVLoadStoreOpt(); diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 87f0c8f..f3adac8 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -83,13 +83,14 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE, // 3) The offset value in the Global Address or Constant Pool is 0. bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, MachineInstr *&Lo) { - if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC && - Hi.getOpcode() != RISCV::PseudoMovAddr) + auto HiOpc = Hi.getOpcode(); + if (HiOpc != RISCV::LUI && HiOpc != RISCV::AUIPC && + HiOpc != RISCV::PseudoMovAddr) return false; const MachineOperand &HiOp1 = Hi.getOperand(1); unsigned ExpectedFlags = - Hi.getOpcode() == RISCV::AUIPC ? RISCVII::MO_PCREL_HI : RISCVII::MO_HI; + HiOpc == RISCV::AUIPC ? 
RISCVII::MO_PCREL_HI : RISCVII::MO_HI; if (HiOp1.getTargetFlags() != ExpectedFlags) return false; @@ -97,7 +98,7 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, HiOp1.getOffset() != 0) return false; - if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + if (HiOpc == RISCV::PseudoMovAddr) { // Most of the code should handle it correctly without modification by // setting Lo and Hi both point to PseudoMovAddr Lo = &Hi; @@ -112,13 +113,13 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, } const MachineOperand &LoOp2 = Lo->getOperand(2); - if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) { + if (HiOpc == RISCV::LUI || HiOpc == RISCV::PseudoMovAddr) { if (LoOp2.getTargetFlags() != RISCVII::MO_LO || !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) || LoOp2.getOffset() != 0) return false; } else { - assert(Hi.getOpcode() == RISCV::AUIPC); + assert(HiOpc == RISCV::AUIPC); if (LoOp2.getTargetFlags() != RISCVII::MO_PCREL_LO || LoOp2.getType() != MachineOperand::MO_MCSymbol) return false; @@ -148,7 +149,8 @@ bool RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &Hi, MachineInstr &Lo, // If Hi is an AUIPC, don't fold the offset if it is outside the bounds of // the global object. The object may be within 2GB of the PC, but addresses // outside of the object might not be. - if (Hi.getOpcode() == RISCV::AUIPC && Hi.getOperand(1).isGlobal()) { + auto HiOpc = Hi.getOpcode(); + if (HiOpc == RISCV::AUIPC && Hi.getOperand(1).isGlobal()) { const GlobalValue *GV = Hi.getOperand(1).getGlobal(); Type *Ty = GV->getValueType(); if (!Ty->isSized() || Offset < 0 || @@ -158,12 +160,13 @@ bool RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &Hi, MachineInstr &Lo, // Put the offset back in Hi and the Lo Hi.getOperand(1).setOffset(Offset); - if (Hi.getOpcode() != RISCV::AUIPC) + if (HiOpc != RISCV::AUIPC) Lo.getOperand(2).setOffset(Offset); // Delete the tail instruction. - MRI->constrainRegClass(Lo.getOperand(0).getReg(), - MRI->getRegClass(Tail.getOperand(0).getReg())); - MRI->replaceRegWith(Tail.getOperand(0).getReg(), Lo.getOperand(0).getReg()); + Register LoOp0Reg = Lo.getOperand(0).getReg(); + Register TailOp0Reg = Tail.getOperand(0).getReg(); + MRI->constrainRegClass(LoOp0Reg, MRI->getRegClass(TailOp0Reg)); + MRI->replaceRegWith(TailOp0Reg, LoOp0Reg); Tail.eraseFromParent(); LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" << " " << Hi << " " << Lo;); @@ -204,8 +207,8 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, return false; // This can point to an ADDI(W) or a LUI: MachineInstr &OffsetTail = *MRI->getVRegDef(Reg); - if (OffsetTail.getOpcode() == RISCV::ADDI || - OffsetTail.getOpcode() == RISCV::ADDIW) { + auto OffsetTailOpc = OffsetTail.getOpcode(); + if (OffsetTailOpc == RISCV::ADDI || OffsetTailOpc == RISCV::ADDIW) { // The offset value has non zero bits in both %hi and %lo parts. // Detect an ADDI that feeds from a LUI instruction. MachineOperand &AddiImmOp = OffsetTail.getOperand(2); @@ -232,7 +235,7 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, int64_t Offset = SignExtend64<32>(LuiImmOp.getImm() << 12); Offset += OffLo; // RV32 ignores the upper 32 bits. ADDIW sign extends the result. - if (!ST->is64Bit() || OffsetTail.getOpcode() == RISCV::ADDIW) + if (!ST->is64Bit() || OffsetTailOpc == RISCV::ADDIW) Offset = SignExtend64<32>(Offset); // We can only fold simm32 offsets. 
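 // Illustrative note (not part of the patch): an ADDI of 0x678 fed by a LUI of
 // 0x12345 yields SignExtend64<32>(0x12345 << 12) + 0x678 = 0x12345678, which
 // fits in simm32 and can therefore be folded.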
if (!isInt<32>(Offset)) @@ -244,7 +247,7 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, OffsetTail.eraseFromParent(); OffsetLui.eraseFromParent(); return true; - } else if (OffsetTail.getOpcode() == RISCV::LUI) { + } else if (OffsetTailOpc == RISCV::LUI) { // The offset value has all zero bits in the lower 12 bits. Only LUI // exists. LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail); @@ -503,14 +506,15 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, Hi.getOperand(1).setOffset(NewOffset); MachineOperand &ImmOp = Lo.getOperand(2); + auto HiOpc = Hi.getOpcode(); // Expand PseudoMovAddr into LUI - if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + if (HiOpc == RISCV::PseudoMovAddr) { auto *TII = ST->getInstrInfo(); Hi.setDesc(TII->get(RISCV::LUI)); Hi.removeOperand(2); } - if (Hi.getOpcode() != RISCV::AUIPC) + if (HiOpc != RISCV::AUIPC) ImmOp.setOffset(NewOffset); // Update the immediate in the load/store instructions to add the offset. diff --git a/llvm/lib/Target/RISCV/RISCVPassRegistry.def b/llvm/lib/Target/RISCV/RISCVPassRegistry.def new file mode 100644 index 0000000..29ccf2c --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPassRegistry.def @@ -0,0 +1,20 @@ +//===- RISCVPassRegistry.def - Registry of RISC-V passes --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is used as the registry of passes that are part of the RISC-V +// backend. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
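+// For context (illustrative; the include site is outside this hunk): a .def
+// registry like this is consumed with the usual X-macro pattern, e.g.
+//   #define FUNCTION_PASS(NAME, CREATE_PASS) /* use NAME and CREATE_PASS */
+//   #include "RISCVPassRegistry.def"
+// so the file must be includable repeatedly, hence no include guard.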
+ +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +FUNCTION_PASS("riscv-codegenprepare", RISCVCodeGenPreparePass(this)) +#undef FUNCTION_PASS diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index e86431f..5becfd2 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -141,7 +141,7 @@ def ROCKET : RISCVTuneProcessorModel<"rocket", RocketModel>; defvar SiFive7TuneFeatures = [TuneSiFive7, TuneNoDefaultUnroll, - TuneShortForwardBranchOpt, + TuneShortForwardBranchIALU, TunePostRAScheduler]; def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series", SiFive7Model, SiFive7TuneFeatures>; @@ -633,6 +633,13 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8", FeatureUnalignedVectorMem]), [TuneNoDefaultUnroll, TuneNLogNVRGather, + TuneOptimizedNF2SegmentLoadStore, + TuneOptimizedNF3SegmentLoadStore, + TuneOptimizedNF4SegmentLoadStore, + TuneOptimizedNF5SegmentLoadStore, + TuneOptimizedNF6SegmentLoadStore, + TuneOptimizedNF7SegmentLoadStore, + TuneOptimizedNF8SegmentLoadStore, TuneOptimizedZeroStrideLoad, TunePostRAScheduler]>; @@ -798,7 +805,7 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25", defvar Andes45TuneFeatures = [TuneAndes45, TuneNoDefaultUnroll, - TuneShortForwardBranchOpt, + TuneShortForwardBranchIALU, TunePostRAScheduler]; def ANDES_45 : RISCVTuneProcessorModel<"andes-45-series", diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 84bb294..d802d19 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -864,6 +864,46 @@ bool RISCVRegisterInfo::getRegAllocationHints( const MachineRegisterInfo *MRI = &MF.getRegInfo(); auto &Subtarget = MF.getSubtarget<RISCVSubtarget>(); + // Handle RegPairEven/RegPairOdd hints for Zilsd register pairs + std::pair<unsigned, Register> Hint = MRI->getRegAllocationHint(VirtReg); + unsigned HintType = Hint.first; + Register Partner = Hint.second; + + MCRegister TargetReg; + if (HintType == RISCVRI::RegPairEven || HintType == RISCVRI::RegPairOdd) { + // Check if we want the even or odd register of a consecutive pair + bool WantOdd = (HintType == RISCVRI::RegPairOdd); + + // First priority: Check if partner is already allocated + if (Partner.isVirtual() && VRM && VRM->hasPhys(Partner)) { + MCRegister PartnerPhys = VRM->getPhys(Partner); + // Calculate the exact register we need for consecutive pairing + TargetReg = PartnerPhys.id() + (WantOdd ? 1 : -1); + + // Verify it's valid and available + if (RISCV::GPRRegClass.contains(TargetReg) && + is_contained(Order, TargetReg)) + Hints.push_back(TargetReg.id()); + } + + // Second priority: Try to find consecutive register pairs in the allocation + // order + for (MCPhysReg PhysReg : Order) { + // Don't add the hint if we already added above. + if (TargetReg == PhysReg) + continue; + + unsigned RegNum = getEncodingValue(PhysReg); + // Check if this register matches the even/odd requirement + bool IsOdd = (RegNum % 2 != 0); + + // Don't provide hints that are paired to a reserved register. + MCRegister Paired = PhysReg + (IsOdd ? 
-1 : 1); + if (WantOdd == IsOdd && !MRI->isReserved(Paired)) + Hints.push_back(PhysReg); + } + } + bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( VirtReg, Order, Hints, MF, VRM, Matrix); @@ -1005,6 +1045,35 @@ bool RISCVRegisterInfo::getRegAllocationHints( return BaseImplRetVal; } +void RISCVRegisterInfo::updateRegAllocHint(Register Reg, Register NewReg, + MachineFunction &MF) const { + MachineRegisterInfo *MRI = &MF.getRegInfo(); + std::pair<unsigned, Register> Hint = MRI->getRegAllocationHint(Reg); + + // Handle RegPairEven/RegPairOdd hints for Zilsd register pairs + if ((Hint.first == RISCVRI::RegPairOdd || + Hint.first == RISCVRI::RegPairEven) && + Hint.second.isVirtual()) { + // If 'Reg' is one of the even/odd register pair and it's now changed + // (e.g. coalesced) into a different register, the other register of the + // pair allocation hint must be updated to reflect the relationship change. + Register Partner = Hint.second; + std::pair<unsigned, Register> PartnerHint = + MRI->getRegAllocationHint(Partner); + + // Make sure partner still points to us + if (PartnerHint.second == Reg) { + // Update partner to point to NewReg instead of Reg + MRI->setRegAllocationHint(Partner, PartnerHint.first, NewReg); + + // If NewReg is virtual, set up the reciprocal hint + // NewReg takes over Reg's role, so it gets the SAME hint type as Reg + if (NewReg.isVirtual()) + MRI->setRegAllocationHint(NewReg, Hint.first, Partner); + } + } +} + Register RISCVRegisterInfo::findVRegWithEncoding(const TargetRegisterClass &RegClass, uint16_t Encoding) const { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 67726db..f29f85e 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -37,6 +37,13 @@ enum : uint8_t { NFShiftMask = 0b111 << NFShift, }; +/// Register allocation hints for Zilsd register pairs +enum { + // Used for Zilsd LD/SD register pairs + RegPairOdd = 1, + RegPairEven = 2, +}; + /// \returns the IsVRegClass for the register class. 
static inline bool isVRegClass(uint8_t TSFlags) { return (TSFlags & IsVRegClassShiftMask) >> IsVRegClassShift; @@ -143,6 +150,9 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; + void updateRegAllocHint(Register Reg, Register NewReg, + MachineFunction &MF) const override; + Register findVRegWithEncoding(const TargetRegisterClass &RegClass, uint16_t Encoding) const; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 6605a5c..f354793 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -222,6 +222,14 @@ def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; def XLenPairFVT : ValueTypeByHwMode<[RV32], [f64]>; + +// P extension +def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64], + [v4i8, v8i8]>; +def XLenVecI16VT : ValueTypeByHwMode<[RV32, RV64], + [v2i16, v4i16]>; +def XLenVecI32VT : ValueTypeByHwMode<[RV64], + [v2i32]>; def XLenRI : RegInfoByHwMode< [RV32, RV64], [RegInfo<32,32,32>, RegInfo<64,64,64>]>; @@ -238,7 +246,9 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList> } class GPRRegisterClass<dag regList> - : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> { + : RISCVRegisterClass<[XLenVT, XLenFVT, + // P extension packed vector types: + XLenVecI8VT, XLenVecI16VT, XLenVecI32VT], 32, regList> { let RegInfos = XLenRI; } @@ -803,6 +813,7 @@ def VMV0 : VReg<VMaskVTs, (add V0), 1>; // The register class is added for inline assembly for vector mask types. def VM : VReg<VMaskVTs, (add VR), 1>; +def VMNoV0 : VReg<VMaskVTs, (sub VR, V0), 1>; defvar VTupM1N2VTs = [riscv_nxv8i8x2, riscv_nxv4i8x2, riscv_nxv2i8x2, riscv_nxv1i8x2]; defvar VTupM1N3VTs = [riscv_nxv8i8x3, riscv_nxv4i8x3, riscv_nxv2i8x3, riscv_nxv1i8x3]; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 36a2f46..f8a7013 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -562,7 +562,7 @@ multiclass SiFive7WriteResBase<int VLEN, // resource, we do not need to use LMULSEWXXX constructors. However, we do // use the SEW from the name to determine the number of Cycles. - foreach mx = SchedMxList in { + foreach mx = SchedMxListEEW8 in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; @@ -582,10 +582,8 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULWriteResMX<"WriteVSTOX8", [VCQ, VS], mx, IsWorstCase>; } } - // TODO: The MxLists need to be filtered by EEW. We only need to support - // LMUL >= SEW_min/ELEN. Here, the smallest EEW prevents us from having MF8 - // since LMUL >= 16/64. 
- foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in { + + foreach mx = SchedMxListEEW16 in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; @@ -605,7 +603,7 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULWriteResMX<"WriteVSTOX16", [VCQ, VS], mx, IsWorstCase>; } } - foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in { + foreach mx = SchedMxListEEW32 in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; @@ -625,7 +623,7 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULWriteResMX<"WriteVSTOX32", [VCQ, VS], mx, IsWorstCase>; } } - foreach mx = ["M1", "M2", "M4", "M8"] in { + foreach mx = SchedMxListEEW64 in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 41071b2..1cbb6db 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -104,6 +104,11 @@ class Get461018Latency<string mx> { int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; } +// Used for: FP FMA operations, complex FP ops +class Get6678Latency<string mx> { + int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c; +} + //===----------------------------------------------------------------------===// class SMX60IsWorstCaseMX<string mx, list<string> MxList> { @@ -120,6 +125,33 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0 defvar SMX60VLEN = 256; defvar SMX60DLEN = !div(SMX60VLEN, 2); +class SMX60GetLMulCycles<string mx> { + int c = !cond( + !eq(mx, "M1") : 1, + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 4, + !eq(mx, "M8") : 8, + !eq(mx, "MF2") : 1, + !eq(mx, "MF4") : 1, + !eq(mx, "MF8") : 1 + ); +} + +class SMX60GetVLMAX<string mx, int sew> { + defvar LMUL = SMX60GetLMulCycles<mx>.c; + int val = !cond( + !eq(mx, "MF2") : !div(!div(SMX60VLEN, 2), sew), + !eq(mx, "MF4") : !div(!div(SMX60VLEN, 4), sew), + !eq(mx, "MF8") : !div(!div(SMX60VLEN, 8), sew), + true: !div(!mul(SMX60VLEN, LMUL), sew) + ); +} + +// Latency for segmented loads and stores are calculated as vl * nf. 
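+// Worked example (illustrative): with SMX60VLEN = 256, SMX60GetVLMAX<"M2", 32>
+// evaluates to (256 * 2) / 32 = 16, so an nf = 4 segmented load/store at
+// LMUL = M2 and EEW = 32 is modeled below as 16 * 4 = 64 cycles of latency
+// and occupancy.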
+class SMX60SegmentedLdStCycles<string mx, int sew, int nf> { + int c = !mul(SMX60GetVLMAX<mx, sew>.val, nf); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -362,23 +394,43 @@ foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; // Unit-stride loads and stores - defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + defvar VLDELatAndOcc = ConstValueUntilLMULThenDoubleBase<"M2", 3, 4, mx>.c; + let Latency = VLDELatAndOcc, ReleaseAtCycles = [VLDELatAndOcc] in { + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + } + defvar VSTELatAndOcc = GetLMULValue<[2, 2, 2, 3, 4, 8, 19], mx>.c; + let Latency = VSTELatAndOcc, ReleaseAtCycles = [VSTELatAndOcc] in { + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + } + defvar VLDFFLatAndOcc = GetLMULValue<[4, 4, 4, 5, 7, 11, 19], mx>.c; + let Latency = VLDFFLatAndOcc, ReleaseAtCycles = [VLDFFLatAndOcc] in { + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + } // Mask loads and stores - defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; - defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + let ReleaseAtCycles = [2] in { + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase>; + } + let Latency = 2, ReleaseAtCycles = [2] in { + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase>; + } // Strided and indexed loads and stores foreach eew = [8, 16, 32, 64] in { - defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defvar StridedLdStLatAndOcc = SMX60GetVLMAX<mx, eew>.val; + let Latency = StridedLdStLatAndOcc, ReleaseAtCycles = [StridedLdStLatAndOcc] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + } + + defvar IndexedLdStLatAndOcc = !div(SMX60GetVLMAX<mx, eew>.val, 2); + let Latency = IndexedLdStLatAndOcc, ReleaseAtCycles = [IndexedLdStLatAndOcc] in { + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } } } @@ -388,51 +440,67 @@ foreach mx = SchedMxList in { foreach eew = [8, 16, 32, 64] in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - // Unit-stride segmented - defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - - // Strided/indexed segmented - defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - 
defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - - // Indexed segmented - defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defvar SegmentedLdStLatAndOcc = SMX60SegmentedLdStCycles<mx, eew, nf>.c; + let Latency = SegmentedLdStLatAndOcc, ReleaseAtCycles = [SegmentedLdStLatAndOcc] in { + // Unit-stride segmented + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } } } } // Whole register move/load/store foreach LMul = [1, 2, 4, 8] in { - def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>; - def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>; + defvar WholeRegLdStLatAndOcc = !if(!eq(LMul, 1), 3, !mul(LMul, 2)); + let Latency = WholeRegLdStLatAndOcc, ReleaseAtCycles = [WholeRegLdStLatAndOcc] in { + def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>; + } - def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; + defvar VMovLatAndOcc = !if(!eq(LMul, 1), 4, !mul(LMul, 2)); + let Latency = VMovLatAndOcc, ReleaseAtCycles = [VMovLatAndOcc] in { + def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; + } } // 11. 
Vector Integer Arithmetic Instructions foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in { + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; } + // Latency of vadd, vsub, vrsub: 4/4/5/8 + // ReleaseAtCycles of vadd, vsub, vrsub: 1/2/4/8 + // Latency of vand, vor, vxor: 4/4/8/16 + // ReleaseAtCycles of vand, vor, vxor: 2/4/8/16 + // They are grouped together, so we used the worst case 4/4/8/16 and 2/4/8/16 + // TODO: use InstRW to override individual instructions' scheduling data defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; - let Latency = VIALULat, ReleaseAtCycles = [4] in { - // Pattern of vadd, vsub, vrsub: 4/4/5/8 - // Pattern of vand, vor, vxor: 4/4/8/16 - // They are grouped together, so we used the worst case 4/4/8/16 - // TODO: use InstRW to override individual instructions' scheduling data + defvar VIALUOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VIALULat, ReleaseAtCycles = [VIALUOcc] in { defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + } + defvar VILogicalLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VILogicalOcc = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c; + let Latency = VILogicalLat, ReleaseAtCycles = [VILogicalOcc] in { defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; @@ -449,7 +517,9 @@ foreach mx = SchedMxList in { defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; } - let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in { + // Slightly increase Occ when LMUL == M8 + defvar VICmpCarryOcc = GetLMULValue<[1, 1, 1, 2, 4, 8, 18], mx>.c; + let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VICmpCarryOcc] in { defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; @@ -458,10 +528,14 @@ foreach mx = SchedMxList in { defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; } - // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, + // Latency of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites - let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in { + defvar VIMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // ReleaseAtCycles for vnmsac/vnmsub is 1/1/1/1/2/5 but we use the worse case + // here since they are grouped together with vmacc/vmadd/vmul/vmulh. 
+ defvar VIMulOcc = ConstOneUntilM1ThenDouble<mx>.c; + let Latency = VIMulLat, ReleaseAtCycles = [VIMulOcc] in { defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; @@ -475,7 +549,8 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in { + defvar VIWideningOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [VIWideningOcc] in { defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; @@ -497,8 +572,9 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; - let Latency = VIDivLat, ReleaseAtCycles = [12] in { + // Not pipelined + defvar VIDivLatAndOcc = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; + let Latency = VIDivLatAndOcc, ReleaseAtCycles = [VIDivLatAndOcc] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; } @@ -510,7 +586,8 @@ foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; - let Latency = VNarrowingLat, ReleaseAtCycles = [4] in { + defvar VNarrowingOcc = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c; + let Latency = VNarrowingLat, ReleaseAtCycles = [VNarrowingOcc] in { defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; @@ -558,39 +635,71 @@ foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, isF=1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; - } -} + defvar VFALULat = Get4458Latency<mx>.c; + defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c; + let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + } -foreach mx = SchedMxListF in { - foreach sew = SchedSEWSet<mx, isF=1>.val in { - defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + // Slightly increased latency for sew == 64 + defvar VFMulVLat = !if(!eq(sew, 64), 
ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c, + Get4458Latency<mx>.c); + let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + } + // VFMulF has the same latency as VFMulV, but slighlty lower ReleaseAtCycles + let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + } - defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } - defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + // The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the + // ReleaseAtCycles takes one extra cycle for the vfn* variants. + // TODO: Should we split them? + // TODO: for some reason, the following cond is not working, and always use ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c + defvar VFMulAddLatency = !if(!eq(sew, 64), + Get6678Latency<mx>.c, + ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c + ); + let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + // Slightly increased ReleaseAtCycles for M8: 18 + defvar VFCmpOcc = !if(!eq(mx, "M8"), + !add(ConstOneUntilMF2ThenDouble<mx>.c, 2), + ConstOneUntilMF2ThenDouble<mx>.c + ); + let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in { + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + } - defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in { + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" 
: LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + } } // Widening @@ -598,27 +707,73 @@ foreach mx = SchedMxListW in { foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c; + let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxListFW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c; - defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c; + let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in { + defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c; + let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } + + // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8 + // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8 + // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17 + // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17 + // We use the worst-case + defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17 + defvar VFWALUOcc = !if(!eq(mx, "M4"), + !add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17 + ConstOneUntilMF4ThenDouble<mx>.c + ); + // TODO: Split .wf/.wv variants into separate scheduling classes + let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + } + + let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + } + + // Slightly increased latency for SEW == 32 + defvar VFWMullOcc = !if(!eq(sew, 32), + GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c, + ConstOneUntilMF2ThenDouble<mx>.c + ); + defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c; + let Latency = VFWMulVLat, ReleaseAtCycles = 
[VFWMullOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + } + + // Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8 + defvar VFWMulAddVLat = !if(!eq(sew, 16), + ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c, + Get6678Latency<mx>.c + ); + let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -626,15 +781,23 @@ foreach mx = SchedMxListFW in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c; + let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in { + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; + } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + + defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c; + let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -643,9 +806,35 @@ foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + // Compute ReleaseAtCycles based on SEW + // Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144 + // Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320 + // We use the worst-case, vfdiv.vf is penalized in e64 + // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes + defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12); + defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor); + let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + } + + defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40); + defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor); + let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + } + } +} + +// Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320 +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40); + defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor); + let Latency = 
VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -740,49 +929,103 @@ foreach mx = SchedMxListFWRed in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = 4 in { + defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; + } + let Latency = 4, ReleaseAtCycles = [ConstValueUntilLMULThenDouble<"M2", 1, mx>.c] in { + defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = 6, ReleaseAtCycles = [2] in { + defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; + } - defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; + defvar VIotaLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VIotaOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VIotaLat, ReleaseAtCycles = [VIotaOcc] in { + defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; + } } // 16. Vector Permutation Instructions +// Slide foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16 + defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSlideUpLat, ReleaseAtCycles =[VSlideUpOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + } - defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17 + defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c; + defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c; + let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + } + // The following group slide up and down together, so we use the worst-case + // (slide down) for all. 
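+ // Illustrative reading: at M4 the grouped writes below get Latency = 9 and
+ // ReleaseAtCycles = 9, i.e. the slide-down numbers.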
+ let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + } } -def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; - -def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; +// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 6, ReleaseAtCycles = [6] in { + def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; +} -// Gather and Compress -foreach mx = SchedMxList in { - foreach sew = SchedSEWSet<mx>.val in { - defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; - } +// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 4, ReleaseAtCycles = [4] in { + def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; + def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; + def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; } +// Integer LMUL Gather and Compress foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + } + + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c; + defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c; + let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + // For sew == 8, latency is half of the other cases, except for the fractional LMULs (const 4 cycles) + defvar VRGatherEI16Lat = !if(!eq(sew, 8), + GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c, + GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c); + defvar VRGatherEI16Occ = !if(!eq(sew, 8), + GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c, + GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c); + let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + + defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c; + defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c; + let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + } } // Others @@ -790,6 +1033,10 
@@ def : WriteRes<WriteCSR, [SMX60_IEU]>; def : WriteRes<WriteNop, [SMX60_IEU]>; def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>; +// Give COPY instructions an execution resource. +// FIXME: This could be better modeled by looking at the regclasses of the operands. +def : InstRW<[WriteIALU], (instrs COPY)>; + //===----------------------------------------------------------------------===// // Bypass and advance def : ReadAdvance<ReadJmp, 0>; diff --git a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td index da89e15..08ee180 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td +++ b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td @@ -8,19 +8,106 @@ //===----------------------------------------------------------------------===// +class AscalonIsWorstCaseMX<string mx, list<string> MxList> { + defvar LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); +} + +class AscalonIsWorstCaseMXSEW<string mx, int sew, list<string> MxList, + bit isF = 0> { + defvar LLMUL = LargestLMUL<MxList>.r; + defvar SSEW = SmallestSEW<mx, isF>.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + +/// Cycle counts that scale with LMUL with LMUL=1 having the same latency as +/// fractional LMULs +class AscalonGetCyclesLMUL<string mx, int base> { + int c = !cond( + !eq(mx, "M1") : base, + !eq(mx, "M2") : !mul(base, 2), + !eq(mx, "M4") : !mul(base, 4), + !eq(mx, "M8") : !mul(base, 8), + !eq(mx, "MF2") : base, + !eq(mx, "MF4") : base, + !eq(mx, "MF8") : base + ); +} + +/// Linear LMUL scaling starting from smallest fractional LMUL +class AscalonGetCyclesLMULFractional<string mx, int base> { + int c = !cond( + !eq(mx, "MF8") : base, + !eq(mx, "MF4") : !mul(base, 2), + !eq(mx, "MF2") : !mul(base, 4), + !eq(mx, "M1") : !mul(base, 8), + !eq(mx, "M2") : !mul(base, 16), + !eq(mx, "M4") : !mul(base, 32), + !eq(mx, "M8") : !mul(base, 64) + ); +} + +class AscalonGetCyclesDefault<string mx> { + int c = AscalonGetCyclesLMUL<mx, 1>.c; +} + +class AscalonGetCyclesNarrowing<string mx> { + int c = !cond( + !eq(mx, "M1") : 4, + !eq(mx, "M2") : 8, + !eq(mx, "M4") : 16, + !eq(mx, "MF2") : 2, + !eq(mx, "MF4") : 1, + !eq(mx, "MF8") : 1 + ); +} + + +class AscalonGetCyclesDivOrSqrt<string mx, int sew> { + int c = !cond( + !eq(sew, 8) : AscalonGetCyclesLMUL<mx, 7>.c, + !eq(sew, 16) : AscalonGetCyclesLMUL<mx, 6>.c, + !eq(sew, 32) : AscalonGetCyclesLMUL<mx, 5>.c, + !eq(sew, 64) : AscalonGetCyclesLMUL<mx, 8>.c + ); +} + +class AscalonGetCyclesVRGatherVV<string mx> { + int c = !cond( + !eq(mx, "M1") : 2, + !eq(mx, "M2") : 4, + !eq(mx, "M4") : 12, + !eq(mx, "M8") : 48, + !eq(mx, "MF2") : 2, + !eq(mx, "MF4") : 2, + !eq(mx, "MF8") : 2 + ); +} + +class AscalonGetCyclesStridedSegmented<string mx, int sew> { + int c = !cond( + !eq(sew, 8) : AscalonGetCyclesLMULFractional<mx, 4>.c, + !eq(sew, 16) : AscalonGetCyclesLMULFractional<mx, 2>.c, + !eq(sew, 32) : AscalonGetCyclesLMULFractional<mx, 1>.c, + !eq(sew, 64) : AscalonGetCyclesLMULFractional<mx, 1>.c + ); +} + +//===----------------------------------------------------------------------===// + def TTAscalonD8Model : SchedMachineModel { let IssueWidth = 8; // 8-way decode and dispatch let MicroOpBufferSize = 256; // 256 micro-op re-order buffer let LoadLatency = 4; // Optimistic load latency let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch - let CompleteModel = 0; + let CompleteModel = false; // TODO: supported, but haven't added scheduling info yet. 
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx, HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne, HasStdExtZknh, HasStdExtZksed, HasStdExtZksh, - HasStdExtZkr, HasVInstructions, HasVInstructionsI64]; + HasStdExtZkr]; } let SchedModel = TTAscalonD8Model in { @@ -34,11 +121,17 @@ let BufferSize = 16 in { def AscalonFXB : ProcResource<1>; // ALU, INT -> FP/VEC def AscalonFXC : ProcResource<2>; // ALU, BR def AscalonFXD : ProcResource<2>; // ALU - def AscalonFP : ProcResource<2>; - // TODO: two vector units with vector scheduling model. + def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>; + // FP + def AscalonFPA : ProcResource<1>; // Pipe A also handles FP/VEC -> INT + def AscalonFPB : ProcResource<1>; + def AscalonFP : ProcResGroup<[AscalonFPA, AscalonFPB]>; + // Vector + def AscalonVA : ProcResource<1>; + def AscalonVB : ProcResource<1>; + def AscalonV : ProcResGroup<[AscalonVA, AscalonVB]>; } -def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>; //===----------------------------------------------------------------------===// @@ -317,9 +410,624 @@ def : ReadAdvance<ReadSingleBit, 0>; def : ReadAdvance<ReadSingleBitImm, 0>; //===----------------------------------------------------------------------===// +// Vector +def : WriteRes<WriteRdVLENB, [AscalonFXA]>; + +// Configuration-Setting Instructions +def : WriteRes<WriteVSETVLI, [AscalonV]>; +def : WriteRes<WriteVSETIVLI, [AscalonV]>; +let Latency = 2 in { + def : WriteRes<WriteVSETVL, [AscalonV]>; +} + +// Vector Loads and Stores +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLDE", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [AscalonLS], mx, IsWorstCase>; + } + defm "" : LMULWriteResMX<"WriteVSTE", [AscalonLS], mx, IsWorstCase>; +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + defm "" : LMULWriteResMX<"WriteVLDM", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTM", [AscalonLS], mx, IsWorstCase>; +} + +foreach mx = SchedMxListEEW8 in { + defvar Cycles = AscalonGetCyclesLMUL<mx, 2>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLDS8", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX8", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX8", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTS8", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX8", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX8", [AscalonLS], mx, IsWorstCase>; + } +} +foreach mx = SchedMxListEEW16 in { + defvar Cycles = AscalonGetCyclesLMUL<mx, 2>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLDS16", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX16", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX16", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTS16", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX16", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX16", [AscalonLS], mx, IsWorstCase>; + } +} +foreach mx = SchedMxListEEW32 in { + defvar Cycles = AscalonGetCyclesLMUL<mx, 2>.c; + defvar 
IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLDS32", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX32", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX32", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTS32", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX32", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX32", [AscalonLS], mx, IsWorstCase>; + } +} +foreach mx = SchedMxListEEW64 in { + defvar Cycles = AscalonGetCyclesLMUL<mx, 2>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLDS64", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX64", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX64", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTS64", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX64", [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX64", [AscalonLS], mx, IsWorstCase>; + } +} + +// VLD*R is not LMUL aware +def : WriteRes<WriteVLD1R, [AscalonLS]>; +def : WriteRes<WriteVLD2R, [AscalonLS]>; +def : WriteRes<WriteVLD4R, [AscalonLS]>; +def : WriteRes<WriteVLD8R, [AscalonLS]>; +// VST*R is not LMUL aware +def : WriteRes<WriteVST1R, [AscalonLS]>; +def : WriteRes<WriteVST2R, [AscalonLS]>; +def : WriteRes<WriteVST4R, [AscalonLS]>; +def : WriteRes<WriteVST8R, [AscalonLS]>; + +// Segmented Loads and Stores +foreach mx = SchedMxList in { + foreach eew = [8, 16, 32, 64] in { + foreach nf=2-8 in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + } + let Latency = 1, AcquireAtCycles = [1], ReleaseAtCycles = [!add(1, Cycles)] in + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + } + } +} +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar Cycles = AscalonGetCyclesStridedSegmented<mx, eew>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles in { + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [AscalonLS], mx, IsWorstCase>; + } + } + } +} + +// Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [AscalonFX, AscalonV], mx, 
IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [AscalonFXA, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [AscalonFXA, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} +// Narrowing +foreach mx = SchedMxListW in { + defvar Cycles = AscalonGetCyclesNarrowing<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVNClipV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} + +// Configuration-Setting Instructions +def : ReadAdvance<ReadVSETVLI, 1>; +def : ReadAdvance<ReadVSETVL, 1>; + +// Vector Loads and Stores +def : ReadAdvance<ReadVLDX, 0>; +def : ReadAdvance<ReadVSTX, 0>; +defm "" : LMULReadAdvance<"ReadVSTEV", 0>; +defm "" : LMULReadAdvance<"ReadVSTM", 0>; +def : ReadAdvance<ReadVLDSX, 0>; +def : ReadAdvance<ReadVSTSX, 0>; +defm "" : LMULReadAdvance<"ReadVSTS8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS64V", 0>; +defm "" : LMULReadAdvance<"ReadVLDUXV", 0>; +defm "" : LMULReadAdvance<"ReadVLDOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTUXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>; +// LMUL Aware +def : ReadAdvance<ReadVST1R, 0>; +def : ReadAdvance<ReadVST2R, 0>; +def : ReadAdvance<ReadVST4R, 0>; +def : ReadAdvance<ReadVST8R, 0>; + +// Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVIALUV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVICALUMI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVExtV", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [AscalonFX, AscalonV], mx, sew, IsWorstCase>; + } + } +} + +// Widening +foreach mx = SchedMxListW in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVIWALUV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} +// Narrowing +foreach mx = SchedMxListW in { + defvar Cycles = AscalonGetCyclesNarrowing<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVNShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [AscalonFX, AscalonV], mx, 
IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} + +// Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + } + } +} +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [AscalonFP, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [AscalonFP, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [AscalonFP, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpV", [AscalonFP, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [AscalonFP, AscalonV], mx, IsWorstCase>; + } +} +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + } + } +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>; + } +} +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : 
LMULSEWWriteResMXSEW<"WriteVFWALUV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>; + } + } + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListFW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in + defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>; +} +// Narrowing +foreach mx = SchedMxListW in { + defvar Cycles = AscalonGetCyclesNarrowing<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>; + } +} +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + defvar Cycles = AscalonGetCyclesNarrowing<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>; + } + } +} + +// Vector Reduction Instructions +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + } + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet<mx, 0, 1>.val in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar RedCycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + let Latency = RedCycles, ReleaseAtCycles = [1, RedCycles] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + } + defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c; + let Latency = OrdRedCycles, ReleaseAtCycles = [1, OrdRedCycles] in + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet<mx, 1, 1>.val in { + defvar RedCycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, 
SchedMxListFWRed, 1>.c; + let Latency = RedCycles, ReleaseAtCycles = [1, RedCycles] in + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c; + let Latency = OrdRedCycles, ReleaseAtCycles = [1, OrdRedCycles] in + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [AscalonFX, AscalonV], + mx, sew, IsWorstCase>; + } +} + +// Vector Mask Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + defm "" : LMULWriteResMX<"WriteVMALUV", [AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMSFSV", [AscalonV], mx, IsWorstCase>; + let Latency = 2, ReleaseAtCycles = [1, 2] in { + defm "" : LMULWriteResMX<"WriteVMPopV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = Cycles, ReleaseAtCycles = [1, Cycles] in { + defm "" : LMULWriteResMX<"WriteVIotaV", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} + +// Vector Permutation Instructions +let Latency = 2, ReleaseAtCycles = [1, 2] in { + def : WriteRes<WriteVMovSX, [AscalonFX, AscalonV]>; + def : WriteRes<WriteVMovXS, [AscalonFX, AscalonV]>; + def : WriteRes<WriteVMovSF, [AscalonFX, AscalonV]>; + def : WriteRes<WriteVMovFS, [AscalonFX, AscalonV]>; +} +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = !mul(Cycles, 2), ReleaseAtCycles = [Cycles, !mul(Cycles, 2)] in { + defm "" : LMULWriteResMX<"WriteVRGatherVX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar Cycles = AscalonGetCyclesVRGatherVV<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + let Latency = !add(Cycles, 3), ReleaseAtCycles = [1, !add(1, Cycles)] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>; + } + } +} + +foreach mx = SchedMxList in { + defvar Cycles = AscalonGetCyclesDefault<mx>.c; + defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; + let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { + defm "" : LMULWriteResMX<"WriteVSlideUpX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSlideDownX", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSlideI", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVISlide1X", [AscalonFX, AscalonV], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [AscalonFX, AscalonV], mx, IsWorstCase>; + } +} + +// Whole vector register move, vmv<N>.v, not LMUL aware +let Latency = 1, ReleaseAtCycles = [1] in + def : WriteRes<WriteVMov1V, [AscalonV]>; +let Latency = 2, ReleaseAtCycles = [2] in + def : WriteRes<WriteVMov2V, [AscalonV]>; +let Latency = 4, ReleaseAtCycles = [4] in + def : 
WriteRes<WriteVMov4V, [AscalonV]>; +let Latency = 8, ReleaseAtCycles = [8] in + def : WriteRes<WriteVMov8V, [AscalonV]>; + +// Vector Integer Arithmetic Instructions +defm : LMULReadAdvance<"ReadVIALUV", 0>; +defm : LMULReadAdvance<"ReadVIALUX", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUX", 0>; +defm : LMULReadAdvance<"ReadVExtV", 0>; +defm : LMULReadAdvance<"ReadVICALUV", 0>; +defm : LMULReadAdvance<"ReadVICALUX", 0>; +defm : LMULReadAdvance<"ReadVShiftV", 0>; +defm : LMULReadAdvance<"ReadVShiftX", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftV", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftX", 0>; +defm : LMULReadAdvance<"ReadVICmpV", 0>; +defm : LMULReadAdvance<"ReadVICmpX", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxV", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxX", 0>; +defm : LMULReadAdvance<"ReadVIMulV", 0>; +defm : LMULReadAdvance<"ReadVIMulX", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivV", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulX", 0>; +defm : LMULReadAdvance<"ReadVIMulAddV", 0>; +defm : LMULReadAdvance<"ReadVIMulAddX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>; +defm : LMULReadAdvance<"ReadVIMergeV", 0>; +defm : LMULReadAdvance<"ReadVIMergeX", 0>; +defm : LMULReadAdvance<"ReadVIMovV", 0>; +defm : LMULReadAdvance<"ReadVIMovX", 0>; + +// Vector Fixed-Point Arithmetic Instructions +defm "" : LMULReadAdvance<"ReadVSALUV", 0>; +defm "" : LMULReadAdvance<"ReadVSALUX", 0>; +defm "" : LMULReadAdvance<"ReadVAALUV", 0>; +defm "" : LMULReadAdvance<"ReadVAALUX", 0>; +defm "" : LMULReadAdvance<"ReadVSMulV", 0>; +defm "" : LMULReadAdvance<"ReadVSMulX", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftV", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftX", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>; + +// Vector Floating-Point Instructions +defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpF", 0>; +defm "" : LMULReadAdvance<"ReadVFClassV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeF", 0>; +defm "" : LMULReadAdvance<"ReadVFMovF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>; +defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; +defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; +defm "" : 
LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; +defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; + +// Vector Reduction Instructions +def : ReadAdvance<ReadVIRedV, 0>; +def : ReadAdvance<ReadVIRedV0, 0>; +def : ReadAdvance<ReadVIWRedV, 0>; +def : ReadAdvance<ReadVIWRedV0, 0>; +def : ReadAdvance<ReadVFRedV, 0>; +def : ReadAdvance<ReadVFRedV0, 0>; +def : ReadAdvance<ReadVFRedOV, 0>; +def : ReadAdvance<ReadVFRedOV0, 0>; +def : ReadAdvance<ReadVFWRedV, 0>; +def : ReadAdvance<ReadVFWRedV0, 0>; +def : ReadAdvance<ReadVFWRedOV, 0>; +def : ReadAdvance<ReadVFWRedOV0, 0>; + +// Vector Mask Instructions +defm "" : LMULReadAdvance<"ReadVMALUV", 0>; +defm "" : LMULReadAdvance<"ReadVMPopV", 0>; +defm "" : LMULReadAdvance<"ReadVMFFSV", 0>; +defm "" : LMULReadAdvance<"ReadVMSFSV", 0>; +defm "" : LMULReadAdvance<"ReadVIotaV", 0>; + +// Vector Permutation Instructions +def : ReadAdvance<ReadVMovXS, 0>; +def : ReadAdvance<ReadVMovSX_V, 0>; +def : ReadAdvance<ReadVMovSX_X, 0>; +def : ReadAdvance<ReadVMovFS, 0>; +def : ReadAdvance<ReadVMovSF_V, 0>; +def : ReadAdvance<ReadVMovSF_F, 0>; +defm "" : LMULReadAdvance<"ReadVISlideV", 0>; +defm "" : LMULReadAdvance<"ReadVISlideX", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideV", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideF", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>; +// LMUL Aware +def : ReadAdvance<ReadVMov1V, 0>; +def : ReadAdvance<ReadVMov2V, 0>; +def : ReadAdvance<ReadVMov4V, 0>; +def : ReadAdvance<ReadVMov8V, 0>; + +// Others +def : ReadAdvance<ReadVMask, 0>; +def : ReadAdvance<ReadVPassthru_WorstCase, 0>; +foreach mx = SchedMxList in { + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>; + foreach sew = SchedSEWSet<mx>.val in + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>; +} + +//===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; -defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index d11b446..601308b 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -19,6 +19,14 @@ defvar SchedMxListFW = !listremove(SchedMxList, ["M8", "MF8"]); defvar SchedMxListF = !listremove(SchedMxList, ["MF8"]); // Used for widening floating-point Reduction as it doesn't contain MF8. 
defvar SchedMxListFWRed = SchedMxListF; +// Used for indexed and strided loads of 8 bit lanes, same as full MX list +defvar SchedMxListEEW8 = SchedMxList; +// Used for indexed and strided loads of 16 bit lanes +defvar SchedMxListEEW16 = SchedMxListF; +// Used for indexed and strided loads of 32 bit lanes +defvar SchedMxListEEW32 = !listremove(SchedMxListEEW16, ["MF4"]); +// Used for indexed and strided loads of 64 bit lanes +defvar SchedMxListEEW64 = !listremove(SchedMxListEEW32, ["MF2"]); class SchedSEWSet<string mx, bit isF = 0, bit isWidening = 0> { assert !or(!not(isF), !ne(mx, "MF8")), "LMUL shouldn't be MF8 for floating-point"; diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp index 041dd07..8b66aa1 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp @@ -22,27 +22,22 @@ RISCVSelectionDAGInfo::~RISCVSelectionDAGInfo() = default; void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const { + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + #ifndef NDEBUG + // Some additional checks not yet implemented by verifyTargetNode. switch (N->getOpcode()) { - default: - return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); case RISCVISD::TUPLE_EXTRACT: - assert(N->getNumOperands() == 2 && "Expected three operands!"); assert(N->getOperand(1).getOpcode() == ISD::TargetConstant && - N->getOperand(1).getValueType() == MVT::i32 && - "Expected index to be an i32 target constant!"); + "Expected index to be a target constant!"); break; case RISCVISD::TUPLE_INSERT: - assert(N->getNumOperands() == 3 && "Expected three operands!"); assert(N->getOperand(2).getOpcode() == ISD::TargetConstant && - N->getOperand(2).getValueType() == MVT::i32 && - "Expected index to be an i32 target constant!"); + "Expected index to be a target constant!"); break; case RISCVISD::VQDOT_VL: case RISCVISD::VQDOTU_VL: case RISCVISD::VQDOTSU_VL: { - assert(N->getNumValues() == 1 && "Expected one result!"); - assert(N->getNumOperands() == 5 && "Expected five operands!"); EVT VT = N->getValueType(0); assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i32 && "Expected result to be an i32 scalable vector"); @@ -52,13 +47,9 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, "Expected result and first 3 operands to have the same type!"); EVT MaskVT = N->getOperand(3).getValueType(); assert(MaskVT.isScalableVector() && - MaskVT.getVectorElementType() == MVT::i1 && MaskVT.getVectorElementCount() == VT.getVectorElementCount() && "Expected mask VT to be an i1 scalable vector with same number of " "elements as the result"); - assert((N->getOperand(4).getValueType() == MVT::i32 || - N->getOperand(4).getValueType() == MVT::i64) && - "Expect VL operand to be i32 or i64"); break; } } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4c..f86265a 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", cl::desc("Use 'mips.ccmov' instruction"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePExtCodeGen( + "enable-p-ext-codegen", + cl::desc("Turn on P Extension codegen(This is a temporary switch where " + "only partial codegen is currently supported)"), + cl::init(false), cl::Hidden); + void RISCVSubtarget::anchor() {} RISCVSubtarget & @@ -82,6 +88,8 @@ 
RISCVSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef CPU, if (TuneCPU.empty()) TuneCPU = CPU; + if (TuneCPU == "generic") + TuneCPU = Is64Bit ? "generic-rv64" : "generic-rv32"; TuneInfo = RISCVTuneInfoTable::getRISCVTuneInfo(TuneCPU); // If there is no TuneInfo for this CPU, we fail back to generic. @@ -104,7 +112,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique<RISCVSelectionDAGInfo>(); } @@ -145,6 +153,10 @@ bool RISCVSubtarget::useConstantPoolForLargeInts() const { return !RISCVDisableUsingConstantPoolForLargeInts; } +bool RISCVSubtarget::enablePExtCodeGen() const { + return HasStdExtP && EnablePExtCodeGen; +} + unsigned RISCVSubtarget::getMaxBuildIntsCost() const { // Loading integer from constant pool needs two instructions (the reason why // the minimum cost is 2): an address calculation instruction and a load diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f..ae6ca97 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ private: RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,7 +139,7 @@ public: } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; @@ -187,7 +186,7 @@ public: } bool hasCLZLike() const { - return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb || + return HasStdExtZbb || HasVendorXTHeadBb || (HasVendorXCVbitmanip && !IsRV64); } bool hasCTZLike() const { @@ -197,7 +196,7 @@ public: return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64); } bool hasREV8Like() const { - return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb; + return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb; } bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } @@ -209,7 +208,7 @@ public: bool hasConditionalMoveFusion() const { // Do we support fusing a branch+mv or branch+c.mv as a conditional move. return (hasConditionalCompressedMoveFusion() && hasStdExtZca()) || - hasShortForwardBranchOpt(); + hasShortForwardBranchIALU(); } bool hasShlAdd(int64_t ShAmt) const { @@ -238,6 +237,13 @@ public: return 0; } + + Align getZilsdAlign() const { + return Align(enableUnalignedScalarMem() ? 1 + : allowZilsd4ByteAlign() ? 4 + : 8); + } + unsigned getELen() const { assert(hasVInstructions() && "Expected V extension"); return hasVInstructionsI64() ? 64 : 32; @@ -322,6 +328,8 @@ public: } } + bool enablePExtCodeGen() const; + // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the // vector hardware implementation which may be less than VLEN. 
unsigned getDLenFactor() const { diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 16ef67d..52dc385 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -103,6 +103,11 @@ static cl::opt<bool> cl::desc("Enable Machine Pipeliner for RISC-V"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableCFIInstrInserter( + "riscv-enable-cfi-instr-inserter", + cl::desc("Enable CFI Instruction Inserter for RISC-V"), cl::init(false), + cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target()); RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target()); @@ -118,7 +123,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVLateBranchOptPass(*PR); initializeRISCVMakeCompressibleOptPass(*PR); initializeRISCVGatherScatterLoweringPass(*PR); - initializeRISCVCodeGenPreparePass(*PR); + initializeRISCVCodeGenPrepareLegacyPassPass(*PR); initializeRISCVPostRAExpandPseudoPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVOptWInstrsPass(*PR); @@ -136,6 +141,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVPushPopOptPass(*PR); initializeRISCVIndirectBranchTrackingPass(*PR); initializeRISCVLoadStoreOptPass(*PR); + initializeRISCVPreAllocZilsdOptPass(*PR); initializeRISCVExpandAtomicPseudoPass(*PR); initializeRISCVRedundantCopyEliminationPass(*PR); initializeRISCVAsmPrinterPass(*PR); @@ -169,7 +175,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, if (TT.isOSFuchsia() && !TT.isArch64Bit()) report_fatal_error("Fuchsia is only supported for 64-bit"); - setCFIFixup(true); + setCFIFixup(!EnableCFIInstrInserter); } const RISCVSubtarget * @@ -456,7 +462,7 @@ void RISCVPassConfig::addIRPasses() { addPass(createRISCVGatherScatterLoweringPass()); addPass(createInterleavedAccessPass()); - addPass(createRISCVCodeGenPreparePass()); + addPass(createRISCVCodeGenPrepareLegacyPass()); } TargetPassConfig::addIRPasses(); @@ -578,6 +584,9 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createUnpackMachineBundles([&](const MachineFunction &MF) { return MF.getFunction().getParent()->getModuleFlag("kcfi"); })); + + if (EnableCFIInstrInserter) + addPass(createCFIInstrInserter()); } void RISCVPassConfig::addMachineSSAOptimization() { @@ -596,6 +605,8 @@ void RISCVPassConfig::addPreRegAlloc() { if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createRISCVMergeBaseOffsetOptPass()); addPass(createRISCVVLOptimizerPass()); + // Add Zilsd pre-allocation load/store optimization + addPass(createRISCVPreAllocZilsdOptPass()); } addPass(createRISCVInsertReadWriteCSRPass()); @@ -628,6 +639,9 @@ bool RISCVPassConfig::addILPOpts() { } void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +#define GET_PASS_REGISTRY "RISCVPassRegistry.def" +#include "llvm/Passes/TargetPassRegistry.inc" + PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM, OptimizationLevel Level) { if (Level != OptimizationLevel::O0) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 3d8eb40..bb469e9 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( if 
(isa<ScalableVectorType>(Ty)) return InstructionCost::getInvalid(); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) { + return 1; // Treat as single instruction cost for now + } + // A build_vector (which is m1 sized or smaller) can be done in no // worse than one vslide1down.vx per element in the type. We could // in theory do an explode_vector in the inverse manner, but our @@ -1001,13 +1008,52 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( } InstructionCost -RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +RISCVTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + switch (MICA.getID()) { + case Intrinsic::vp_load_ff: { + EVT DataTypeVT = TLI->getValueType(DL, DataTy); + if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment)) + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); + + unsigned AS = MICA.getAddressSpace(); + return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, nullptr); + } + case Intrinsic::experimental_vp_strided_load: + case Intrinsic::experimental_vp_strided_store: + return getStridedMemoryOpCost(MICA, CostKind); + case Intrinsic::masked_compressstore: + case Intrinsic::masked_expandload: + return getExpandCompressMemoryOpCost(MICA, CostKind); + case Intrinsic::vp_scatter: + case Intrinsic::vp_gather: + case Intrinsic::masked_scatter: + case Intrinsic::masked_gather: + return getGatherScatterOpCost(MICA, CostKind); + case Intrinsic::vp_load: + case Intrinsic::vp_store: + case Intrinsic::masked_load: + case Intrinsic::masked_store: + return getMaskedMemoryOpCost(MICA, CostKind); + } + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); +} + +InstructionCost +RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + if (!isLegalMaskedLoadStore(Src, Alignment) || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } @@ -1109,19 +1155,24 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( return MemCost + ShuffleCost; } -InstructionCost RISCVTTIImpl::getGatherScatterOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { +InstructionCost +RISCVTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + + bool IsLoad = MICA.getID() == Intrinsic::masked_gather || + MICA.getID() == Intrinsic::vp_gather; + unsigned Opcode = IsLoad ? 
Instruction::Load : Instruction::Store; + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + const Instruction *I = MICA.getInst(); if (CostKind != TTI::TCK_RecipThroughput) - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); if ((Opcode == Instruction::Load && !isLegalMaskedGather(DataTy, Align(Alignment))) || (Opcode == Instruction::Store && !isLegalMaskedScatter(DataTy, Align(Alignment)))) - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); // Cost is proportional to the number of memory operations implied. For // scalable vectors, we use an estimate on that number since we don't @@ -1135,15 +1186,20 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( } InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( - unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, - TTI::TargetCostKind CostKind, const Instruction *I) const { + const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload + ? Instruction::Load + : Instruction::Store; + Type *DataTy = MICA.getDataType(); + bool VariableMask = MICA.getVariableMask(); + Align Alignment = MICA.getAlignment(); bool IsLegal = (Opcode == Instruction::Store && isLegalMaskedCompressStore(DataTy, Alignment)) || (Opcode == Instruction::Load && isLegalMaskedExpandLoad(DataTy, Alignment)); if (!IsLegal || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask, - Alignment, CostKind, I); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); // Example compressstore sequence: // vsetivli zero, 8, e32, m2, ta, ma (ignored) // vcompress.vm v10, v8, v0 @@ -1172,14 +1228,20 @@ InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); } -InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { - if (((Opcode == Instruction::Load || Opcode == Instruction::Store) && - !isLegalStridedLoadStore(DataTy, Alignment)) || - (Opcode != Instruction::Load && Opcode != Instruction::Store)) - return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); +InstructionCost +RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + + unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load + ? Instruction::Load + : Instruction::Store; + + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + const Instruction *I = MICA.getInst(); + + if (!isLegalStridedLoadStore(DataTy, Alignment)) + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); if (CostKind == TTI::TCK_CodeSize) return TTI::TCC_Basic; @@ -1497,6 +1559,23 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::fshl: + case Intrinsic::fshr: { + if (ICA.getArgs().empty()) + break; + + // Funnel-shifts are ROTL/ROTR when the first and second operand are equal. + // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W) + // instruction. 
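// As a concrete instance of that identity (illustrative note, not part of
// the patch): for i32, fshl(x, x, 13) == (x << 13) | (x >> (32 - 13)), which
// is rotl(x, 13) and lowers to a single rotate (e.g. rori) under Zbb/Zbkb,
// so the case below returns a cost of 1 instead of the generic
// shift+shift+or expansion.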
+ if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() && + ICA.getArgs()[0] == ICA.getArgs()[1] && + (RetTy->getIntegerBitWidth() == 32 || + RetTy->getIntegerBitWidth() == 64) && + RetTy->getIntegerBitWidth() <= ST->getXLen()) { + return 1; + } + break; + } case Intrinsic::get_active_lane_mask: { if (ST->hasVInstructions()) { Type *ExpRetTy = VectorType::get( @@ -1543,16 +1622,6 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return Cost; } - case Intrinsic::experimental_vp_splat: { - auto LT = getTypeLegalizationCost(RetTy); - // TODO: Lower i1 experimental_vp_splat - if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1) - return InstructionCost::getInvalid(); - return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint() - ? RISCV::VFMV_V_F - : RISCV::VMV_V_X, - LT.second, CostKind); - } case Intrinsic::experimental_vp_splice: { // To support type-based query from vectorizer, set the index to 0. // Note that index only change the cost from vslide.vx to vslide.vi and in @@ -1625,6 +1694,14 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsVectorType) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && + (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) { + return 1; // Treat as single instruction cost for now + } + // FIXME: Need to compute legalizing cost for illegal types. The current // code handles only legal types and those which can be trivially // promoted to legal. @@ -2323,6 +2400,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, const Value *Op1) const { assert(Val->isVectorTy() && "This must be a vector type"); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) { + return 1; // Treat as single instruction cost for now + } + if (Opcode != Instruction::ExtractElement && Opcode != Instruction::InsertElement) return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); @@ -2708,7 +2792,10 @@ void RISCVTTIImpl::getUnrollingPreferences( // Both auto-vectorized loops and the scalar remainder have the // isvectorized attribute, so differentiate between them by the presence // of vector instructions. - if (IsVectorized && I.getType()->isVectorTy()) + if (IsVectorized && (I.getType()->isVectorTy() || + llvm::any_of(I.operand_values(), [](Value *V) { + return V->getType()->isVectorTy(); + }))) return; if (isa<CallInst>(I) || isa<InvokeInst>(I)) { @@ -3322,11 +3409,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands( if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) continue; - // We are looking for a splat/vp.splat that can be sunk. - bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>( - m_Value(), m_Value(), m_Value())); - if (!IsVPSplat && - !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), + // We are looking for a splat that can be sunk. 
+ if (!match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), m_Value(), m_ZeroMask()))) continue; @@ -3343,16 +3427,11 @@ bool RISCVTTIImpl::isProfitableToSinkOperands( } // Sink any fpexts since they might be used in a widening fp pattern. - if (IsVPSplat) { - if (isa<FPExtInst>(Op->getOperand(0))) - Ops.push_back(&Op->getOperandUse(0)); - } else { - Use *InsertEltUse = &Op->getOperandUse(0); - auto *InsertElt = cast<InsertElementInst>(InsertEltUse); - if (isa<FPExtInst>(InsertElt->getOperand(1))) - Ops.push_back(&InsertElt->getOperandUse(1)); - Ops.push_back(InsertEltUse); - } + Use *InsertEltUse = &Op->getOperandUse(0); + auto *InsertElt = cast<InsertElementInst>(InsertEltUse); + if (isa<FPExtInst>(InsertElt->getOperand(1))) + Ops.push_back(&InsertElt->getOperandUse(1)); + Ops.push_back(InsertEltUse); Ops.push_back(&OpIdx.value()); } return true; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6886e896..e6b75d7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -144,9 +144,11 @@ public: bool shouldConsiderVectorizationRegPressure() const override { return true; } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) const override; + getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const override; + + InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, @@ -191,22 +193,15 @@ public: Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false) const override; - InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override; + InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost - getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) const override; + getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; - InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override; + InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const override; @@ -286,11 +281,13 @@ public: } bool isLegalMaskedLoad(Type *DataType, Align Alignment, - unsigned /*AddressSpace*/) const override { + unsigned /*AddressSpace*/, + TTI::MaskKind /*MaskKind*/) const override { return isLegalMaskedLoadStore(DataType, Alignment); } bool isLegalMaskedStore(Type *DataType, Align Alignment, - unsigned /*AddressSpace*/) const override { + unsigned /*AddressSpace*/, + TTI::MaskKind /*MaskKind*/) const override { return isLegalMaskedLoadStore(DataType, Alignment); } diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 0a8838c..638bf12 100644 --- 
a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -62,7 +62,7 @@ struct DemandedVL { }; class RISCVVLOptimizer : public MachineFunctionPass { - const MachineRegisterInfo *MRI; + MachineRegisterInfo *MRI; const MachineDominatorTree *MDT; const TargetInstrInfo *TII; @@ -85,7 +85,7 @@ private: DemandedVL getMinimumVLForUser(const MachineOperand &UserOp) const; /// Returns true if the users of \p MI have compatible EEWs and SEWs. bool checkUsers(const MachineInstr &MI) const; - bool tryReduceVL(MachineInstr &MI) const; + bool tryReduceVL(MachineInstr &MI, MachineOperand VL) const; bool isCandidate(const MachineInstr &MI) const; void transfer(const MachineInstr &MI); @@ -1392,6 +1392,42 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } +/// Given a vslidedown.vx like: +/// +/// %slideamt = ADDI %x, -1 +/// %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1 +/// +/// %v will only read the first %slideamt + 1 lanes of %src, which = %x. +/// This is a common case when lowering extractelement. +/// +/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be +/// completely slid down and none of its lanes will be read (since %slideamt is +/// greater than the largest VLMAX of 65536) so we can demand any minimum VL. +static std::optional<DemandedVL> +getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp, + const MachineRegisterInfo *MRI) { + const MachineInstr &MI = *UserOp.getParent(); + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX) + return std::nullopt; + // We're looking at what lanes are used from the src operand. + if (UserOp.getOperandNo() != 2) + return std::nullopt; + // For now, the AVL must be 1. + const MachineOperand &AVL = MI.getOperand(4); + if (!AVL.isImm() || AVL.getImm() != 1) + return std::nullopt; + // The slide amount must be %x - 1. 
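// Worked instance (illustrative): with %x = 8, the ADDI yields
// %slideamt = 7, and a vslidedown.vx with AVL = 1 writes element 0 of the
// result from element 7 of %src, so only elements 0..7 of %src are read --
// the demanded VL is slideamt + AVL = (x - 1) + 1 = x.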
+ const MachineOperand &SlideAmt = MI.getOperand(3); + if (!SlideAmt.getReg().isVirtual()) + return std::nullopt; + MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg()); + if (SlideAmtDef->getOpcode() != RISCV::ADDI || + SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() || + !SlideAmtDef->getOperand(1).getReg().isVirtual()) + return std::nullopt; + return SlideAmtDef->getOperand(1); +} + DemandedVL RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { const MachineInstr &UserMI = *UserOp.getParent(); @@ -1406,6 +1442,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return DemandedVL::vlmax(); } + if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI)) + return *VL; + if (RISCVII::readsPastVL( TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); @@ -1568,7 +1607,8 @@ bool RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { return true; } -bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { +bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI, + MachineOperand CommonVL) const { LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI); unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); @@ -1581,49 +1621,47 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { return false; } - auto *CommonVL = &DemandedVLs.at(&MI).VL; - - assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && + assert((CommonVL.isImm() || CommonVL.getReg().isVirtual()) && "Expected VL to be an Imm or virtual Reg"); // If the VL is defined by a vleff that doesn't dominate MI, try using the // vleff's AVL. It will be greater than or equal to the output VL. - if (CommonVL->isReg()) { - const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + if (CommonVL.isReg()) { + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL.getReg()); if (RISCVInstrInfo::isFaultOnlyFirstLoad(*VLMI) && !MDT->dominates(VLMI, &MI)) - CommonVL = &VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc())); + CommonVL = VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc())); } - if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { + if (!RISCV::isVLKnownLE(CommonVL, VLOp)) { LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); return false; } - if (CommonVL->isIdenticalTo(VLOp)) { + if (CommonVL.isIdenticalTo(VLOp)) { LLVM_DEBUG( dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n"); return false; } - if (CommonVL->isImm()) { + if (CommonVL.isImm()) { LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " - << CommonVL->getImm() << " for " << MI << "\n"); - VLOp.ChangeToImmediate(CommonVL->getImm()); + << CommonVL.getImm() << " for " << MI << "\n"); + VLOp.ChangeToImmediate(CommonVL.getImm()); return true; } - const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL.getReg()); if (!MDT->dominates(VLMI, &MI)) { LLVM_DEBUG(dbgs() << " Abort due to VL not dominating.\n"); return false; } - LLVM_DEBUG( - dbgs() << " Reduce VL from " << VLOp << " to " - << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) - << " for " << MI << "\n"); + LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " + << printReg(CommonVL.getReg(), MRI->getTargetRegisterInfo()) + << " for " << MI << "\n"); // All our checks passed. We can reduce VL. 
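// Note on the register-class constraint added below: the new VL register is
// constrained to GPRNoX0 because an AVL of x0 encodes VLMAX rather than the
// value held in the register, so x0 must not be installed as the VL operand.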
- VLOp.ChangeToRegister(CommonVL->getReg(), false); + VLOp.ChangeToRegister(CommonVL.getReg(), false); + MRI->constrainRegClass(CommonVL.getReg(), &RISCV::GPRNoX0RegClass); return true; } @@ -1678,18 +1716,13 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { // Then go through and see if we can reduce the VL of any instructions to // only what's demanded. bool MadeChange = false; - for (MachineBasicBlock &MBB : MF) { - // Avoid unreachable blocks as they have degenerate dominance - if (!MDT->isReachableFromEntry(&MBB)) + for (auto &[MI, VL] : DemandedVLs) { + assert(MDT->isReachableFromEntry(MI->getParent())); + if (!isCandidate(*MI)) continue; - - for (auto &MI : reverse(MBB)) { - if (!isCandidate(MI)) - continue; - if (!tryReduceVL(MI)) - continue; - MadeChange = true; - } + if (!tryReduceVL(*const_cast<MachineInstr *>(MI), VL.VL)) + continue; + MadeChange = true; } DemandedVLs.clear(); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index fdf9a4f..a5385be 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -73,7 +73,7 @@ private: bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional<unsigned> getConstant(const MachineOperand &VL) const; bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; - bool isKnownSameDefs(Register A, Register B) const; + Register lookThruCopies(Register Reg, bool OneUseOnly = false) const; }; } // namespace @@ -387,23 +387,21 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { return true; } -bool RISCVVectorPeephole::isKnownSameDefs(Register A, Register B) const { - if (A.isPhysical() || B.isPhysical()) - return false; - - auto LookThruVirtRegCopies = [this](Register Reg) { - while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) { - if (!Def->isFullCopy()) - break; - Register Src = Def->getOperand(1).getReg(); - if (!Src.isVirtual()) - break; - Reg = Src; - } - return Reg; - }; - - return LookThruVirtRegCopies(A) == LookThruVirtRegCopies(B); +// If \p Reg is defined by one or more COPYs of virtual registers, traverses +// the chain and returns the root non-COPY source. 
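// For example (illustrative): given
//   %1:vr = PseudoVADD_VV_M1 ...
//   %2:vr = COPY %1
//   %3:vr = COPY %2
// lookThruCopies(%3) walks both full copies and returns %1. With OneUseOnly
// set, a COPY is only looked through when the register it defines has
// exactly one non-debug use.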
+Register RISCVVectorPeephole::lookThruCopies(Register Reg, + bool OneUseOnly) const { + while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) { + if (!Def->isFullCopy()) + break; + Register Src = Def->getOperand(1).getReg(); + if (!Src.isVirtual()) + break; + if (OneUseOnly && !MRI->hasOneNonDBGUse(Reg)) + break; + Reg = Src; + } + return Reg; } /// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the @@ -428,10 +426,11 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { if (!TrueMaskedInfo || !hasSameEEW(MI, *True)) return false; - const MachineOperand &TrueMask = - True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs()); - const MachineOperand &MIMask = MI.getOperand(4); - if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg())) + Register TrueMaskReg = lookThruCopies( + True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs()) + .getReg()); + Register MIMaskReg = lookThruCopies(MI.getOperand(4).getReg()); + if (!TrueMaskReg.isVirtual() || TrueMaskReg != MIMaskReg) return false; // Masked off lanes past TrueVL will come from False, and converting to vmv @@ -455,7 +454,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { True->getOperand(1).setReg(MI.getOperand(2).getReg()); // If True is masked then its passthru needs to be in VRNoV0. MRI->constrainRegClass(True->getOperand(1).getReg(), - TII->getRegClass(True->getDesc(), 1, TRI)); + TII->getRegClass(True->getDesc(), 1)); } MI.setDesc(TII->get(NewOpc)); @@ -652,11 +651,23 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (!hasSameEEW(MI, *Src)) return false; + std::optional<std::pair<unsigned, unsigned>> NeedsCommute; + // Src needs to have the same passthru as VMV_V_V MachineOperand &SrcPassthru = Src->getOperand(Src->getNumExplicitDefs()); if (SrcPassthru.getReg().isValid() && - SrcPassthru.getReg() != Passthru.getReg()) - return false; + SrcPassthru.getReg() != Passthru.getReg()) { + // If Src's passthru != Passthru, check if it uses Passthru in another + // operand and try to commute it. + int OtherIdx = Src->findRegisterUseOperandIdx(Passthru.getReg(), TRI); + if (OtherIdx == -1) + return false; + unsigned OpIdx1 = OtherIdx; + unsigned OpIdx2 = Src->getNumExplicitDefs(); + if (!TII->findCommutedOpIndices(*Src, OpIdx1, OpIdx2)) + return false; + NeedsCommute = {OpIdx1, OpIdx2}; + } // Src VL will have already been reduced if legal (see tryToReduceVL), // so we don't need to handle a smaller source VL here. However, the @@ -669,13 +680,20 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (!ensureDominates(Passthru, *Src)) return false; + if (NeedsCommute) { + auto [OpIdx1, OpIdx2] = *NeedsCommute; + [[maybe_unused]] bool Commuted = + TII->commuteInstruction(*Src, /*NewMI=*/false, OpIdx1, OpIdx2); + assert(Commuted && "Failed to commute Src?"); + } + if (SrcPassthru.getReg() != Passthru.getReg()) { SrcPassthru.setReg(Passthru.getReg()); // If Src is masked then its passthru needs to be in VRNoV0. 
if (Passthru.getReg().isValid()) MRI->constrainRegClass( Passthru.getReg(), - TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI)); + TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo())); } if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { @@ -717,9 +735,10 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMERGE_VVM) return false; - Register PassthruReg = MI.getOperand(1).getReg(); - Register FalseReg = MI.getOperand(2).getReg(); - Register TrueReg = MI.getOperand(3).getReg(); + Register PassthruReg = lookThruCopies(MI.getOperand(1).getReg()); + Register FalseReg = lookThruCopies(MI.getOperand(2).getReg()); + Register TrueReg = + lookThruCopies(MI.getOperand(3).getReg(), /*OneUseOnly=*/true); if (!TrueReg.isVirtual() || !MRI->hasOneUse(TrueReg)) return false; MachineInstr &True = *MRI->getUniqueVRegDef(TrueReg); @@ -740,16 +759,17 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { // We require that either passthru and false are the same, or that passthru // is undefined. - if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg)) + if (PassthruReg && !(PassthruReg.isVirtual() && PassthruReg == FalseReg)) return false; std::optional<std::pair<unsigned, unsigned>> NeedsCommute; // If True has a passthru operand then it needs to be the same as vmerge's // False, since False will be used for the result's passthru operand. - Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg(); + Register TruePassthru = + lookThruCopies(True.getOperand(True.getNumExplicitDefs()).getReg()); if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru && - !isKnownSameDefs(TruePassthru, FalseReg)) { + !(TruePassthru.isVirtual() && TruePassthru == FalseReg)) { // If True's passthru != False, check if it uses False in another operand // and try to commute it. int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI); @@ -837,6 +857,8 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { MRI->constrainRegClass( MO.getReg(), True.getRegClassConstraint(MO.getOperandNo(), TII, TRI)); } + // We should clear the IsKill flag since we have a new use now. + MRI->clearKillFlags(FalseReg); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/RISCV/RISCVZilsdOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVZilsdOptimizer.cpp new file mode 100644 index 0000000..3b47903 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVZilsdOptimizer.cpp @@ -0,0 +1,527 @@ +//===-- RISCVZilsdOptimizer.cpp - RISC-V Zilsd Load/Store Optimizer ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs load/store optimizations for the +// RISC-V Zilsd extension. It combines pairs of 32-bit load/store instructions +// into single 64-bit LD/SD instructions when possible. +// +// The pass runs in two phases: +// 1. Pre-allocation: Reschedules loads/stores to bring consecutive memory +// accesses closer together and forms LD/SD pairs with register hints. +// 2. Post-allocation: Fixes invalid LD/SD instructions if register allocation +// didn't provide suitable consecutive registers. 
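// For example (illustrative), on RV32 with Zilsd and an 8-byte-aligned base:
//   lw a0, 8(s0)
//   lw a1, 12(s0)
// can become
//   ld a0, 8(s0)
// once register allocation assigns the even/odd pair a0/a1 to the two
// results.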
+//
+// Note: the second phase is integrated into RISCVLoadStoreOptimizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVRegisterInfo.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-zilsd-opt"
+
+STATISTIC(NumLDFormed, "Number of LD instructions formed");
+STATISTIC(NumSDFormed, "Number of SD instructions formed");
+
+static cl::opt<bool>
+    DisableZilsdOpt("disable-riscv-zilsd-opt", cl::Hidden, cl::init(false),
+                    cl::desc("Disable Zilsd load/store optimization"));
+
+static cl::opt<unsigned> MaxRescheduleDistance(
+    "riscv-zilsd-max-reschedule-distance", cl::Hidden, cl::init(10),
+    cl::desc("Maximum distance for rescheduling load/store instructions"));
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Pre-allocation Zilsd optimization pass
+//===----------------------------------------------------------------------===//
+class RISCVPreAllocZilsdOpt : public MachineFunctionPass {
+public:
+  static char ID;
+
+  RISCVPreAllocZilsdOpt() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "RISC-V pre-allocation Zilsd load/store optimization";
+  }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setIsSSA();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  enum class MemoryOffsetKind {
+    Imm = 0,
+    Global = 1,
+    CPI = 2,
+    BlockAddr = 3,
+    Unknown = 4,
+  };
+  using MemOffset = std::pair<MemoryOffsetKind, int>;
+  using BaseRegInfo = std::pair<unsigned, MemoryOffsetKind>;
+
+private:
+  bool isMemoryOp(const MachineInstr &MI);
+  bool rescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  bool canFormLdSdPair(MachineInstr *MI0, MachineInstr *MI1);
+  bool rescheduleOps(MachineBasicBlock *MBB,
+                     SmallVectorImpl<MachineInstr *> &MIs, BaseRegInfo Base,
+                     bool IsLoad,
+                     DenseMap<MachineInstr *, unsigned> &MI2LocMap);
+  bool isSafeToMove(MachineInstr *MI, MachineInstr *Target, bool MoveForward);
+  MemOffset getMemoryOpOffset(const MachineInstr &MI);
+
+  const RISCVSubtarget *STI;
+  const RISCVInstrInfo *TII;
+  const RISCVRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  AliasAnalysis *AA;
+  MachineDominatorTree *DT;
+  Align RequiredAlign;
+};
+
+} // end anonymous namespace
+
+char RISCVPreAllocZilsdOpt::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RISCVPreAllocZilsdOpt, "riscv-prera-zilsd-opt",
+                      "RISC-V pre-allocation Zilsd optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RISCVPreAllocZilsdOpt, "riscv-prera-zilsd-opt",
+                    "RISC-V pre-allocation Zilsd optimization", false, false)
+
+//===----------------------------------------------------------------------===//
+// Pre-allocation pass implementation
+//===----------------------------------------------------------------------===//
+
+bool RISCVPreAllocZilsdOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (DisableZilsdOpt || skipFunction(MF.getFunction()))
+    return false;
+
+  STI = &MF.getSubtarget<RISCVSubtarget>();
+
+  // Only run on RV32 with the Zilsd extension.
+  if (STI->is64Bit() || !STI->hasStdExtZilsd())
+    return false;
+
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+
+  // Check alignment: the default requirement is 8 bytes, but 4 bytes is
+  // allowed with a tune feature. If unaligned scalar memory is enabled, any
+  // alignment is allowed.
+  RequiredAlign = STI->getZilsdAlign();
+
+  bool Modified = false;
+  for (auto &MBB : MF) {
+    Modified |= rescheduleLoadStoreInstrs(&MBB);
+  }
+
+  return Modified;
+}
+
+RISCVPreAllocZilsdOpt::MemOffset
+RISCVPreAllocZilsdOpt::getMemoryOpOffset(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case RISCV::LW:
+  case RISCV::SW: {
+    // For LW/SW, the offset is in operand 2
+    const MachineOperand &OffsetOp = MI.getOperand(2);
+
+    // Handle immediate offset
+    if (OffsetOp.isImm())
+      return std::make_pair(MemoryOffsetKind::Imm, OffsetOp.getImm());
+
+    // Handle symbolic operands with MO_LO flag (from MergeBaseOffset)
+    if (OffsetOp.getTargetFlags() & RISCVII::MO_LO) {
+      if (OffsetOp.isGlobal())
+        return std::make_pair(MemoryOffsetKind::Global, OffsetOp.getOffset());
+      if (OffsetOp.isCPI())
+        return std::make_pair(MemoryOffsetKind::CPI, OffsetOp.getOffset());
+      if (OffsetOp.isBlockAddress())
+        return std::make_pair(MemoryOffsetKind::BlockAddr,
+                              OffsetOp.getOffset());
+    }
+
+    break;
+  }
+  default:
+    break;
+  }
+
+  return std::make_pair(MemoryOffsetKind::Unknown, 0);
+}
+
+bool RISCVPreAllocZilsdOpt::canFormLdSdPair(MachineInstr *MI0,
+                                            MachineInstr *MI1) {
+  if (!MI0->hasOneMemOperand() || !MI1->hasOneMemOperand())
+    return false;
+
+  // Get offsets and check they are consecutive
+  int Offset0 = getMemoryOpOffset(*MI0).second;
+  int Offset1 = getMemoryOpOffset(*MI1).second;
+
+  // Offsets must be 4 bytes apart
+  if (Offset1 - Offset0 != 4)
+    return false;
+
+  // We need to guarantee that the alignment of (base + offset) is legal.
+  const MachineMemOperand *MMO = *MI0->memoperands_begin();
+  if (MMO->getAlign() < RequiredAlign)
+    return false;
+
+  // Check that the two destination/source registers are different for
+  // load/store respectively.
+  Register FirstReg = MI0->getOperand(0).getReg();
+  Register SecondReg = MI1->getOperand(0).getReg();
+  if (FirstReg == SecondReg)
+    return false;
+
+  return true;
+}
+
+bool RISCVPreAllocZilsdOpt::isSafeToMove(MachineInstr *MI, MachineInstr *Target,
+                                         bool MoveForward) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator Start = MI->getIterator();
+  MachineBasicBlock::iterator End = Target->getIterator();
+
+  if (!MoveForward)
+    std::swap(Start, End);
+
+  // Increment Start to skip the current instruction
+  if (Start != MBB->end())
+    ++Start;
+
+  Register DefReg = MI->getOperand(0).getReg();
+  Register BaseReg = MI->getOperand(1).getReg();
+
+  unsigned ScanCount = 0;
+  for (auto It = Start; It != End; ++It, ++ScanCount) {
+    // Don't move across calls or terminators
+    if (It->isCall() || It->isTerminator()) {
+      LLVM_DEBUG(dbgs() << "Cannot move across call/terminator: " << *It);
+      return false;
+    }
+
+    // Don't move across instructions with unmodeled side effects, such as
+    // memory barriers
+    if (It->hasUnmodeledSideEffects()) {
+      LLVM_DEBUG(dbgs() << "Cannot move across instruction with side effects: "
+                        << *It);
+      return false;
+    }
+
+    // Check if the base register is modified
+    if (It->modifiesRegister(BaseReg, TRI)) {
+      LLVM_DEBUG(dbgs() << "Base register " << BaseReg
+                        << " modified by: " << *It);
+      return false;
+    }
+
+    // For loads, check if the loaded value is used or clobbered
+    if (MI->mayLoad() &&
+        (It->readsRegister(DefReg, TRI) || It->modifiesRegister(DefReg, TRI))) {
+      LLVM_DEBUG(dbgs() << "Destination register " << DefReg
+                        << " used by: " << *It);
+      return false;
+    }
+
+    // For stores, check if the stored register is modified
+    if (MI->mayStore() && It->modifiesRegister(DefReg, TRI)) {
+      LLVM_DEBUG(dbgs() << "Source register " << DefReg
+                        << " modified by: " << *It);
+      return false;
+    }
+
+    // Check for memory operation interference
+    if (It->mayLoadOrStore() && It->mayAlias(AA, *MI, /*UseTBAA*/ false)) {
+      LLVM_DEBUG(dbgs() << "Memory operation interference detected\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool RISCVPreAllocZilsdOpt::rescheduleOps(
+    MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &MIs,
+    BaseRegInfo Base, bool IsLoad,
+    DenseMap<MachineInstr *, unsigned> &MI2LocMap) {
+  // Sort by offset; at this point the base reg and MemoryOffsetKind are
+  // guaranteed to be the same, so we only need to sort by offset value.
+  llvm::sort(MIs.begin(), MIs.end(), [this](MachineInstr *A, MachineInstr *B) {
+    return getMemoryOpOffset(*A).second < getMemoryOpOffset(*B).second;
+  });
+
+  bool Modified = false;
+
+  // Try to pair consecutive operations
+  for (size_t i = 0; i + 1 < MIs.size(); i++) {
+    MachineInstr *MI0 = MIs[i];
+    MachineInstr *MI1 = MIs[i + 1];
+
+    Register FirstReg = MI0->getOperand(0).getReg();
+    Register SecondReg = MI1->getOperand(0).getReg();
+    Register BaseReg = MI0->getOperand(1).getReg();
+    const MachineOperand &OffsetOp = MI0->getOperand(2);
+
+    // At this point, MI0 and MI1 are:
+    // 1. both either LW or SW,
+    // 2. guaranteed to have the same memory kind,
+    // 3. guaranteed to have the same base register,
+    // 4. already sorted by offset value,
+    // so we don't have to check these in canFormLdSdPair.
+    if (!canFormLdSdPair(MI0, MI1))
+      continue;
+
+    // Use MI2LocMap to determine which instruction appears later in program
+    // order
+    bool MI1IsLater = MI2LocMap[MI1] > MI2LocMap[MI0];
+
+    // For loads: move later instruction up (backwards) to earlier instruction
+    // For stores: move earlier instruction down (forwards) to later instruction
+    MachineInstr *MoveInstr, *TargetInstr;
+    if (IsLoad) {
+      // For loads: move the later instruction to the earlier one
+      MoveInstr = MI1IsLater ? MI1 : MI0;
+      TargetInstr = MI1IsLater ? MI0 : MI1;
+    } else {
+      // For stores: move the earlier instruction to the later one
+      MoveInstr = MI1IsLater ? MI0 : MI1;
+      TargetInstr = MI1IsLater ? MI1 : MI0;
+    }
+
+    unsigned Distance = MI1IsLater ? MI2LocMap[MI1] - MI2LocMap[MI0]
+                                   : MI2LocMap[MI0] - MI2LocMap[MI1];
+    if (!isSafeToMove(MoveInstr, TargetInstr, !IsLoad) ||
+        Distance > MaxRescheduleDistance)
+      continue;
+
+    // Move the instruction to the target position
+    MachineBasicBlock::iterator InsertPos = TargetInstr->getIterator();
+    ++InsertPos;
+
+    // If we need to move an instruction, do it now
+    if (MoveInstr != TargetInstr)
+      MBB->splice(InsertPos, MBB, MoveInstr->getIterator());
+
+    // Create the paired instruction
+    MachineInstrBuilder MIB;
+    DebugLoc DL = MI0->getDebugLoc();
+
+    if (IsLoad) {
+      MIB = BuildMI(*MBB, InsertPos, DL, TII->get(RISCV::PseudoLD_RV32_OPT))
+                .addReg(FirstReg, RegState::Define)
+                .addReg(SecondReg, RegState::Define)
+                .addReg(BaseReg)
+                .add(OffsetOp);
+      ++NumLDFormed;
+      LLVM_DEBUG(dbgs() << "Formed LD: " << *MIB << "\n");
+    } else {
+      MIB = BuildMI(*MBB, InsertPos, DL, TII->get(RISCV::PseudoSD_RV32_OPT))
+                .addReg(FirstReg)
+                .addReg(SecondReg)
+                .addReg(BaseReg)
+                .add(OffsetOp);
+      ++NumSDFormed;
+      LLVM_DEBUG(dbgs() << "Formed SD: " << *MIB << "\n");
+    }
+
+    // Copy memory operands
+    MIB.cloneMergedMemRefs({MI0, MI1});
+
+    // Add register allocation hints for consecutive registers
+    // RISC-V Zilsd requires even/odd register pairs
+    // Only set hints for virtual registers (physical registers already have
+    // encoding)
+    if (FirstReg.isVirtual() && SecondReg.isVirtual()) {
+      // For virtual registers, we can't determine even/odd yet, but we can hint
+      // that they should be allocated as a consecutive pair
+      MRI->setRegAllocationHint(FirstReg, RISCVRI::RegPairEven, SecondReg);
+      MRI->setRegAllocationHint(SecondReg, RISCVRI::RegPairOdd, FirstReg);
+    }
+
+    // Remove the original instructions
+    MI0->eraseFromParent();
+    MI1->eraseFromParent();
+
+    Modified = true;
+
+    // Skip the next instruction since we've already processed it
+    i++;
+  }
+
+  return Modified;
+}
+
+bool RISCVPreAllocZilsdOpt::isMemoryOp(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  if (Opcode != RISCV::LW && Opcode != RISCV::SW)
+    return false;
+
+  if (!MI.getOperand(1).isReg())
+    return false;
+
+  // When no memory operands are present, conservatively assume unaligned,
+  // volatile, unfoldable.
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+  if (MMO->isVolatile() || MMO->isAtomic())
+    return false;
+
+  // sw <undef> could probably be eliminated entirely, but for now we just want
+  // to avoid making a mess of it.
+  if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
+    return false;
+
+  // Likewise don't mess with references to undefined addresses.
+  if (MI.getOperand(1).isUndef())
+    return false;
+
+  return true;
+}
+
+bool RISCVPreAllocZilsdOpt::rescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+  bool Modified = false;
+
+  // Process the basic block in windows delimited by calls, terminators,
+  // or instructions with duplicate base+offset pairs
+  MachineBasicBlock::iterator MBBI = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+
+  while (MBBI != E) {
+    // Map from instruction to its location in the current window
+    DenseMap<MachineInstr *, unsigned> MI2LocMap;
+
+    // Map from base register to list of load/store instructions
+    using Base2InstMap = DenseMap<BaseRegInfo, SmallVector<MachineInstr *, 4>>;
+    using BaseVec = SmallVector<BaseRegInfo, 4>;
+    Base2InstMap Base2LdsMap;
+    Base2InstMap Base2StsMap;
+    BaseVec LdBases;
+    BaseVec StBases;
+
+    unsigned Loc = 0;
+
+    // Build the current window of instructions
+    for (; MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+
+      // Stop at barriers (calls and terminators)
+      if (MI.isCall() || MI.isTerminator()) {
+        // Move past the barrier for the next iteration
+        ++MBBI;
+        break;
+      }
+
+      // Track the instruction's location in the window
+      if (!MI.isDebugInstr())
+        MI2LocMap[&MI] = ++Loc;
+
+      MemOffset Offset = getMemoryOpOffset(MI);
+      // Skip non-memory operations and invalid memory offset kinds.
+      if (!isMemoryOp(MI) || Offset.first == MemoryOffsetKind::Unknown)
+        continue;
+
+      bool IsLd = (MI.getOpcode() == RISCV::LW);
+      Register Base = MI.getOperand(1).getReg();
+      bool StopHere = false;
+
+      // Lambda to find or add base register entries
+      auto FindBases = [&](Base2InstMap &Base2Ops, BaseVec &Bases) {
+        auto [BI, Inserted] = Base2Ops.try_emplace({Base.id(), Offset.first});
+        if (Inserted) {
+          // First time seeing this base register
+          BI->second.push_back(&MI);
+          Bases.push_back({Base.id(), Offset.first});
+          return;
+        }
+        // Check if we've seen this exact base+offset before
+        if (any_of(BI->second, [&](const MachineInstr *PrevMI) {
+              return Offset == getMemoryOpOffset(*PrevMI);
+            })) {
+          // Found duplicate base+offset - stop here to process current window
+          StopHere = true;
+        } else {
+          BI->second.push_back(&MI);
+        }
+      };
+
+      if (IsLd)
+        FindBases(Base2LdsMap, LdBases);
+      else
+        FindBases(Base2StsMap, StBases);
+
+      if (StopHere) {
+        // Found a duplicate (a base+offset combination seen earlier).
+        // Backtrack to process the current window.
+        --Loc;
+        break;
+      }
+    }
+
+    // Process the current window - reschedule loads
+    for (auto Base : LdBases) {
+      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
+      if (Lds.size() > 1) {
+        Modified |= rescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+      }
+    }
+
+    // Process the current window - reschedule stores
+    for (auto Base : StBases) {
+      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
+      if (Sts.size() > 1) {
+        Modified |= rescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+      }
+    }
+  }
+
+  return Modified;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass creation functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createRISCVPreAllocZilsdOptPass() {
+  return new RISCVPreAllocZilsdOpt();
+}
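
For readers unfamiliar with the pairing rule used by the new pass, here is a small standalone C++ sketch, outside of LLVM and not part of the patch, of the constraints canFormLdSdPair enforces before two LW/SW with a common base can become one LD/SD: the offsets must be exactly 4 bytes apart, the pair must meet the required alignment, and the two data registers must differ. The MemAccess struct and canPair function are illustrative names, not LLVM or patch APIs.

#include <cstdint>

// Simplified view of one LW/SW: its data register, its immediate offset from
// the shared base register, and the alignment known for (base + offset).
struct MemAccess {
  unsigned Reg;
  int Offset;
  uint64_t Align;
};

// Mirrors the checks above: consecutive words, sufficient alignment for the
// paired 64-bit access, and distinct data registers.
bool canPair(const MemAccess &Lo, const MemAccess &Hi, uint64_t RequiredAlign) {
  if (Hi.Offset - Lo.Offset != 4)
    return false;
  if (Lo.Align < RequiredAlign)
    return false;
  return Lo.Reg != Hi.Reg;
}

With an 8-byte RequiredAlign, for example, a pair like lw a0, 0(a1) / lw a2, 4(a1) qualifies only when the address a1+0 is known to be 8-byte aligned, which matches the default behavior described in runOnMachineFunction above.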
