Diffstat (limited to 'llvm/lib/Target')
69 files changed, 2683 insertions, 1064 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index c62582a..a99856d 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
   // name (emitting the definition) can grab it from the metadata.
   //
   // FIXME: Handle functions with weak linkage?
-  if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) {
+  if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
    if (std::optional<std::string> MangledName =
            getArm64ECMangledFunctionName(F.getName().str())) {
      F.setMetadata("arm64ec_unmangled_name",
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 5b5ffd7..4fa719a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
     TS->emitDirectiveVariantPCS(CurrentFnSym);
   }
 
-  if (TM.getTargetTriple().isWindowsArm64EC()) {
+  if (TM.getTargetTriple().isWindowsArm64EC() &&
+      !MF->getFunction().hasLocalLinkage()) {
     // For ARM64EC targets, a function definition's name is mangled differently
     // from the normal symbol. We emit the alias from the unmangled symbol to
     // mangled symbol name here.
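The rewritten guard is strictly broader than the old linkage whitelist: every non-local function now gets the mangled-name metadata, and so does a local function whose address is taken. A minimal standalone sketch of just that predicate change, using a toy linkage enum rather than LLVM's real API:

    #include <cassert>

    enum class Linkage { External, Weak, LinkOnce, Internal, Private };

    // Old guard: mangle only for an explicit whitelist of linkage kinds.
    static bool needsMangledNameOld(Linkage L) {
      return L == Linkage::External || L == Linkage::Weak ||
             L == Linkage::LinkOnce;
    }

    // New guard: any non-local function, or a local one whose address escapes.
    static bool needsMangledNameNew(Linkage L, bool AddressTaken) {
      bool HasLocalLinkage = (L == Linkage::Internal || L == Linkage::Private);
      return !HasLocalLinkage || AddressTaken;
    }

    int main() {
      // Everything the old guard accepted is still accepted...
      assert(needsMangledNameOld(Linkage::Weak) &&
             needsMangledNameNew(Linkage::Weak, false));
      // ...and address-taken internal functions are newly covered.
      assert(!needsMangledNameOld(Linkage::Internal));
      assert(needsMangledNameNew(Linkage::Internal, /*AddressTaken=*/true));
    }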
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3485edb..5cc612e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -239,11 +239,6 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                    cl::desc("enable use of redzone on AArch64"),
                                    cl::init(false), cl::Hidden);
 
-static cl::opt<bool>
-    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
-                         cl::desc("reverse the CSR restore sequence"),
-                         cl::init(false), cl::Hidden);
-
 static cl::opt<bool> StackTaggingMergeSetTag(
     "stack-tagging-merge-settag",
     cl::desc("merge settag instruction in function epilog"), cl::init(true),
@@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
     return false;
   if (!EnableHomogeneousPrologEpilog)
     return false;
-  if (ReverseCSRRestoreSeq)
-    return false;
   if (EnableRedZone)
     return false;
 
@@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
+                   .setMIFlag(MachineInstr::FrameDestroy);
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1, RegState::Define);
+      MIB.addReg(RPI.Reg2, RegState::Define);
+    }
+    return true;
+  }
+
+  // For performance reasons restore SVE register in increasing order
+  auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+  auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+  auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
+  std::reverse(PPRBegin, PPREnd);
+  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+  auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+  auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
+  std::reverse(ZPRBegin, ZPREnd);
+
+  for (const RegPairInfo &RPI : RegPairs) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -3191,42 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
             MachineMemOperand::MOLoad, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
-
-    return MIB->getIterator();
-  };
-
-  // SVE objects are always restored in reverse order.
-  for (const RegPairInfo &RPI : reverse(RegPairs))
-    if (RPI.isScalable())
-      EmitMI(RPI);
-
-  if (homogeneousPrologEpilog(MF, &MBB)) {
-    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
-                   .setMIFlag(MachineInstr::FrameDestroy);
-    for (auto &RPI : RegPairs) {
-      MIB.addReg(RPI.Reg1, RegState::Define);
-      MIB.addReg(RPI.Reg2, RegState::Define);
-    }
-    return true;
-  }
-
-  if (ReverseCSRRestoreSeq) {
-    MachineBasicBlock::iterator First = MBB.end();
-    for (const RegPairInfo &RPI : reverse(RegPairs)) {
-      if (RPI.isScalable())
-        continue;
-      MachineBasicBlock::iterator It = EmitMI(RPI);
-      if (First == MBB.end())
-        First = It;
-    }
-    if (First != MBB.end())
-      MBB.splice(MBBI, &MBB, First);
-  } else {
-    for (const RegPairInfo &RPI : RegPairs) {
-      if (RPI.isScalable())
-        continue;
-      (void)EmitMI(RPI);
-    }
   }
 
   return true;
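The patch drops the post-hoc reversal of scalable pairs and instead reverses the PPR and ZPR runs in place, so each class is restored in increasing order. A standalone sketch of that find-the-run-and-reverse idiom, assuming (as the patch does) that each register class forms one contiguous run in the pair list; the pair type and offsets are toy stand-ins:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    enum class RegType { GPR, PPR, ZPR };
    struct RegPair { RegType Type; int Offset; };

    int main() {
      // Toy data: each class contiguous, offsets descending within the class.
      std::vector<RegPair> Pairs = {{RegType::PPR, 3}, {RegType::PPR, 2},
                                    {RegType::ZPR, 9}, {RegType::ZPR, 8},
                                    {RegType::GPR, 1}};

      auto IsPPR = [](const RegPair &P) { return P.Type == RegType::PPR; };
      auto PPRBegin = std::find_if(Pairs.begin(), Pairs.end(), IsPPR);
      auto PPREnd = std::find_if_not(PPRBegin, Pairs.end(), IsPPR);
      std::reverse(PPRBegin, PPREnd); // PPRs now restore at offsets 2, 3.

      auto IsZPR = [](const RegPair &P) { return P.Type == RegType::ZPR; };
      auto ZPRBegin = std::find_if(Pairs.begin(), Pairs.end(), IsZPR);
      auto ZPREnd = std::find_if_not(ZPRBegin, Pairs.end(), IsZPR);
      std::reverse(ZPRBegin, ZPREnd); // ZPRs now restore at offsets 8, 9.

      assert(Pairs[0].Offset == 2 && Pairs[2].Offset == 8);
    }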
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 184ebc1..3b92e95 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
-    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+    if (Subtarget->hasFPARMv8())
+      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+    if (Subtarget->hasFPARMv8())
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
 
@@ -947,9 +949,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 
-  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
-  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
-  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+  }
 
   // Indexed loads and stores are supported.
   for (unsigned im = (unsigned)ISD::PRE_INC;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 436b21f..bec1348 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1308,6 +1308,8 @@ private:
   bool preferScalarizeSplat(SDNode *N) const override;
 
   unsigned getMinimumJumpTableEntries() const override;
+
+  bool softPromoteHalfType() const override { return true; }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 0ae9a69..1c577a2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU],   (instregex "^(S|U)DIVWr$")>;
 def : InstRW<[TSV110Wr_20cyc_1MDU],   (instregex "^(S|U)DIVXr$")>;
 
 def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
 def  TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>;
-def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
 
 def : InstRW<[TSV110Wr_4cyc_1MDU],    (instregex "^(S|U)MULHrr$")>;
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 961dded..ef7c517 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
   for (auto &I : SInfo.AllocasToInstrument) {
     memtag::AllocaInfo &Info = I.second;
     assert(Info.AI && SIB.isInterestingAlloca(*Info.AI));
-    TrackingVH<Instruction> OldAI = Info.AI;
     memtag::alignAndPadAlloca(Info, kTagGranuleSize);
     AllocaInst *AI = Info.AI;
     int Tag = NextTag;
@@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
                        ConstantInt::get(IRB.getInt64Ty(), Tag)});
     if (Info.AI->hasName())
       TagPCall->setName(Info.AI->getName() + ".tag");
-    Info.AI->replaceAllUsesWith(TagPCall);
+    // Does not replace metadata, so we don't have to handle DPValues.
+    Info.AI->replaceNonMetadataUsesWith(TagPCall);
     TagPCall->setOperand(0, Info.AI);
 
     // Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       for (auto *II : Info.LifetimeEnd)
         II->eraseFromParent();
     }
-
-    // Fixup debug intrinsics to point to the new alloca.
-    for (auto *DVI : Info.DbgVariableIntrinsics)
-      DVI->replaceVariableLocationOp(OldAI, Info.AI);
-    for (auto *DPV : Info.DbgVariableRecords)
-      DPV->replaceVariableLocationOp(OldAI, Info.AI);
   }
 
   // If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6655931..010e569 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
 
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                          Op2Info);
+  case ISD::FREM:
+    // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
+    // those functions are not declared in the module.
+    if (!Ty->isVectorTy())
+      return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+                                         Op2Info);
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2..9218760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule<
          [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
 
+def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
+  (defs root:$root),
+  (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)),
+         (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root,
+         [{ return matchFDivSqrtToRsqF16(*${root}); }]),
+  (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
@@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
   [all_combines, gfx6gfx7_combines, gfx8_combines,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+   rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0d3b158..13d7510 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
   return true;
 }
 
-static const unsigned SPDenormModeBitField =
-    AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+static constexpr unsigned SPDenormModeBitField =
+    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
 
 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
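For reference, the hwreg immediate layout is id in bits [5:0], offset in [10:6], and width-minus-one in [15:11] (the enums being deleted from SIDefines.h further down), so HwregEncoding::encode(ID_MODE, 4, 2) yields exactly the old ID_MODE | (4 << OFFSET_SHIFT_) | (1 << WIDTH_M1_SHIFT_) constant. A standalone sketch of the packing arithmetic; the numeric value used for ID_MODE here is hypothetical:

    #include <cassert>
    #include <cstdint>

    // Pack a hwreg(id, offset, width) immediate: id [5:0], offset [10:6],
    // width-1 [15:11] -- the layout the new EncodingField helpers formalize.
    constexpr uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
      return Id | (Offset << 6) | ((Width - 1) << 11);
    }

    int main() {
      constexpr uint64_t IdMode = 1; // hypothetical value of ID_MODE
      // encode(ID_MODE, 4, 2) == old ID_MODE | (4 << 6) | (1 << 11).
      static_assert(encodeHwreg(IdMode, 4, 2) ==
                    (IdMode | (4u << 6) | (1u << 11)));
      // A full 32-bit field starting at offset 0 stores width-1 == 31.
      assert(encodeHwreg(IdMode, 0, 32) == (IdMode | (31u << 11)));
    }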
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e9..82e17dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,9 @@ public:
   matchRcpSqrtToRsq(MachineInstr &MI,
                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 
+  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
+  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
+    MachineInstr &MI) const {
+  Register Sqrt = MI.getOperand(2).getReg();
+  return MRI.hasOneNonDBGUse(Sqrt);
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
+    MachineInstr &MI, const Register &X) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Y = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  uint32_t Flags = MI.getFlags();
+  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+                     .addUse(X)
+                     .setMIFlags(Flags)
+                     .getReg(0);
+  B.buildFMul(Dst, RSQ, Y, Flags);
+  MI.eraseFromParent();
+}
+
 bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
   Register SrcReg = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5b32b34..b7b471d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
 
   if (trySkipId("hwreg", AsmToken::LParen)) {
     OperandInfoTy HwReg(OPR_ID_UNKNOWN);
-    OperandInfoTy Offset(OFFSET_DEFAULT_);
-    OperandInfoTy Width(WIDTH_DEFAULT_);
+    OperandInfoTy Offset(HwregOffset::Default);
+    OperandInfoTy Width(HwregSize::Default);
     if (parseHwregBody(HwReg, Offset, Width) &&
         validateHwreg(HwReg, Offset, Width)) {
-      ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
+      ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
     } else {
       return ParseStatus::Failure;
     }
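The new combine rewrites y / sqrt(x) as y * rsq(x) for f16, relying on the contract flag on both instructions and on the sqrt having a single non-debug use. Numerically it is the identity y / sqrt(x) = y * (1 / sqrt(x)); a standalone sketch with amdgcn_rsq modeled as 1/sqrt (the real instruction is a hardware approximation):

    #include <cassert>
    #include <cmath>

    // Model of the amdgcn_rsq intrinsic: reciprocal square root.
    static float rsq(float X) { return 1.0f / std::sqrt(X); }

    int main() {
      float Y = 3.0f, X = 2.0f;
      float Div = Y / std::sqrt(X); // G_FDIV(y, G_FSQRT(x)) before the combine
      float Mul = Y * rsq(X);       // fmul(y, rsq(x)) after the combine
      assert(std::fabs(Div - Mul) < 1e-6f);
    }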
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 894607d..e1cca17 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
   return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
 }
 
+static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
+                                 const MCDisassembler *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeDpp8FI(Val));
+}
+
 #define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
   static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
                                         uint64_t /*Addr*/,                     \
@@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
   return DecoderUInt128(Lo, Hi);
 }
 
-// The disassembler is greedy, so we need to check FI operand value to
-// not parse a dpp if the correct literal is not set. For dpp16 the
-// autogenerated decoder checks the dpp literal
-static bool isValidDPP8(const MCInst &MI) {
-  using namespace llvm::AMDGPU::DPP;
-  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
-  assert(FiIdx != -1);
-  if ((unsigned)FiIdx >= MI.getNumOperands())
-    return false;
-  unsigned Fi = MI.getOperand(FiIdx).getImm();
-  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
-}
-
 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                 ArrayRef<uint8_t> Bytes_,
                                                 uint64_t Address,
@@ -460,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
   Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
-  DecodeStatus Res = MCDisassembler::Fail;
+  // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
+  // there are fewer bytes left). This will be overridden on success.
+  Size = std::min((size_t)4, Bytes_.size());
+
   do {
     // ToDo: better to switch encoding length using some bit predicate
     // but it is unknown yet, so try all we can
@@ -469,222 +465,147 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // encodings
     if (isGFX11Plus() && Bytes.size() >= 12 ) {
       DecoderUInt128 DecW = eat12Bytes(Bytes);
-      Res =
-          tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
-                        MI, DecW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-      Res =
-          tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
-                        MI, DecW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      const auto convertVOPDPP = [&]() {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
-          convertVOP3PDPPInst(MI);
-        } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
-          convertVOPCDPPInst(MI); // Special VOP3 case
-        } else {
-          assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
-          convertVOP3DPPInst(MI); // Regular VOP3 case
-        }
-      };
-      Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
-                          MI, DecW, Address, CS);
-      if (Res) {
-        convertVOPDPP();
-        break;
-      }
-      Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
-                          MI, DecW, Address, CS);
-      if (Res) {
-        convertVOPDPP();
-        break;
-      }
-      Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
-      if (Res)
+
+      if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
+                        DecW, Address, CS))
         break;
-      Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
-      if (Res)
+
+      if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
+                        DecW, Address, CS))
         break;
-      Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
-      if (Res)
+
+      if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
         break;
     }
+
     // Reinitialize Bytes
     Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
     if (Bytes.size() >= 8) {
       const uint64_t QW = eatBytes<uint64_t>(Bytes);
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
-        Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
-        if (Res) {
-          if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
-              == -1)
-            break;
-          if (convertDPP8Inst(MI) == MCDisassembler::Success)
-            break;
-          MI = MCInst(); // clear
-        }
-      }
-
-      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      Res = tryDecodeInst(DecoderTableDPP8GFX1164,
-                          DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
-        break;
-      MI = MCInst(); // clear
-
-      Res = tryDecodeInst(DecoderTableDPP8GFX1264,
-                          DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
-      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+          tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
         break;
-      MI = MCInst(); // clear
 
-      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
-      if (Res) break;
-
-      Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
-                          MI, QW, Address, CS);
-      if (Res) {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
-          convertVOPCDPPInst(MI);
-        break;
-      }
-
-      Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
-                          MI, QW, Address, CS);
-      if (Res) {
-        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
-          convertVOPCDPPInst(MI);
+      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
+          tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
         break;
-      }
-
-      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
-        Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
 
       // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
       // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
       // table first so we print the correct name.
-      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
-        Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
+          tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
+        break;
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
-        Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
+          tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
+        break;
 
-      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
-        Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
-        if (Res)
-          break;
-      }
+      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+          tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
+        break;
 
-      Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
        break;
 
-      Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
        break;
 
-      Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI,
-                          QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+                        Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI,
-                          QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+                        Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
         break;
 
-      Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
         break;
     }
 
-    // Reinitialize Bytes as DPP64 could have eaten too much
+    // Reinitialize Bytes
     Bytes = Bytes_.slice(0, MaxInstBytesNum);
 
     // Try decode 32-bit instruction
-    if (Bytes.size() < 4) break;
-    const uint32_t DW = eatBytes<uint32_t>(Bytes);
-    Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
-    if (Res) break;
+    if (Bytes.size() >= 4) {
+      const uint32_t DW = eatBytes<uint32_t>(Bytes);
 
-    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
+        break;
 
-    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
-      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
-      if (Res)
+      if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
         break;
-    }
 
-    if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
-      Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
-      if (Res) break;
-    }
+      if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+          tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
+        break;
+
+      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+          tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
-                        Address, CS);
-    if (Res) break;
+      if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+                        Address, CS))
+        break;
 
-    Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
-                        Address, CS);
+      if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+                        Address, CS))
+        break;
+    }
+
+    return MCDisassembler::Fail;
   } while (false);
 
-  if (Res && AMDGPU::isMAC(MI.getOpcode())) {
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
+    if (isMacDPP(MI))
+      convertMacDPPInst(MI);
+
+    if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+      convertVOP3PDPPInst(MI);
+    else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
+             AMDGPU::isVOPC64DPP(MI.getOpcode()))
+      convertVOPCDPPInst(MI); // Special VOP3 case
+    else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
+             -1)
+      convertDPP8Inst(MI);
+    else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
+      convertVOP3DPPInst(MI); // Regular VOP3 case
+  }
+
+  if (AMDGPU::isMAC(MI.getOpcode())) {
     // Insert dummy unused src2_modifiers.
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src2_modifiers);
   }
 
-  if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
-              MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
    // Insert dummy unused src2_modifiers.
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src2_modifiers);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+  if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
       !AMDGPU::hasGDS(STI)) {
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
     int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::cpol);
     if (CPolPos != -1) {
@@ -700,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
-      (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
+  if ((MCII->get(MI.getOpcode()).TSFlags &
+       (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+      (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
     // GFX90A lost TFE, its place is occupied by ACC.
     int TFEOpIdx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
@@ -713,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
     int SWZOpIdx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
     if (SWZOpIdx != -1) {
@@ -724,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
     int VAddr0Idx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
     int RsrcIdx =
@@ -732,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
     if (VAddr0Idx >= 0 && NSAArgs > 0) {
       unsigned NSAWords = (NSAArgs + 3) / 4;
-      if (Bytes.size() < 4 * NSAWords) {
-        Res = MCDisassembler::Fail;
-      } else {
-        for (unsigned i = 0; i < NSAArgs; ++i) {
-          const unsigned VAddrIdx = VAddr0Idx + 1 + i;
-          auto VAddrRCID =
-              MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
-          MI.insert(MI.begin() + VAddrIdx,
-                    createRegOperand(VAddrRCID, Bytes[i]));
-        }
-        Bytes = Bytes.slice(4 * NSAWords);
+      if (Bytes.size() < 4 * NSAWords)
+        return MCDisassembler::Fail;
+      for (unsigned i = 0; i < NSAArgs; ++i) {
+        const unsigned VAddrIdx = VAddr0Idx + 1 + i;
+        auto VAddrRCID =
+            MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
+        MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
       }
+      Bytes = Bytes.slice(4 * NSAWords);
     }
 
-    if (Res)
-      Res = convertMIMGInst(MI);
+    convertMIMGInst(MI);
   }
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
-              (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
-    Res = convertMIMGInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags &
+      (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
+    convertMIMGInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
-    Res = convertEXPInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
+    convertEXPInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
-    Res = convertVINTERPInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
+    convertVINTERPInst(MI);
 
-  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA))
-    Res = convertSDWAInst(MI);
+  if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
+    convertSDWAInst(MI);
 
   int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                               AMDGPU::OpName::vdst_in);
@@ -782,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   int ImmLitIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
   bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
-  if (Res && ImmLitIdx != -1 && !IsSOPK)
-    Res = convertFMAanyK(MI, ImmLitIdx);
+  if (ImmLitIdx != -1 && !IsSOPK)
+    convertFMAanyK(MI, ImmLitIdx);
 
-  // if the opcode was not recognized we'll assume a Size of 4 bytes
-  // (unless there are fewer bytes left)
-  Size = Res ? (MaxInstBytesNum - Bytes.size())
-             : std::min((size_t)4, Bytes_.size());
-  return Res;
+  Size = MaxInstBytesNum - Bytes.size();
+  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
   if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
     // The MCInst still has these fields even though they are no longer encoded
     // in the GFX11 instruction.
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
   }
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
@@ -815,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
     // instruction.
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
   }
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
       STI.hasFeature(AMDGPU::FeatureGFX10)) {
     if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
@@ -835,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
     }
   }
-  return MCDisassembler::Success;
 }
 
 struct VOPModifiers {
@@ -939,56 +850,40 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
                        AMDGPU::OpName::src2_modifiers);
 }
 
-// We must check FI == literal to reject not genuine dpp8 insts, and we must
-// first add optional MI operands to check FI
-DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
-  if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
-    convertVOP3PDPPInst(MI);
-  } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
-             AMDGPU::isVOPC64DPP(Opc)) {
-    convertVOPCDPPInst(MI);
-  } else {
-    if (isMacDPP(MI))
-      convertMacDPPInst(MI);
 
-    int VDstInIdx =
-        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
-    if (VDstInIdx != -1)
-      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+  int VDstInIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+  if (VDstInIdx != -1)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
 
-    if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
-        MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
-      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
 
-    unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+    convertTrue16OpSel(MI);
+    auto Mods = collectVOPModifiers(MI);
+    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+                         AMDGPU::OpName::op_sel);
+  } else {
+    // Insert dummy unused src modifiers.
+    if (MI.getNumOperands() < DescNumOps &&
+        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src0_modifiers);
+
     if (MI.getNumOperands() < DescNumOps &&
-        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
-      convertTrue16OpSel(MI);
-      auto Mods = collectVOPModifiers(MI);
-      insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
-                           AMDGPU::OpName::op_sel);
-    } else {
-      // Insert dummy unused src modifiers.
-      if (MI.getNumOperands() < DescNumOps &&
-          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
-        insertNamedMCOperand(MI, MCOperand::createImm(0),
-                             AMDGPU::OpName::src0_modifiers);
-
-      if (MI.getNumOperands() < DescNumOps &&
-          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
-        insertNamedMCOperand(MI, MCOperand::createImm(0),
-                             AMDGPU::OpName::src1_modifiers);
-    }
+        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src1_modifiers);
   }
-  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
 }
 
-DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
-  if (isMacDPP(MI))
-    convertMacDPPInst(MI);
-
+void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   convertTrue16OpSel(MI);
 
   int VDstInIdx =
@@ -1008,13 +903,12 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                          AMDGPU::OpName::op_sel);
   }
-  return MCDisassembler::Success;
 }
 
 // Note that before gfx10, the MIMG encoding provided no information about
 // VADDR size. Consequently, decoded instructions always show address as if it
 // has 1 dword, which could be not really so.
-DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
   auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
 
   int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -1043,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
   if (BaseOpcode->BVH) {
     // Add A16 operand for intersect_ray instructions
     addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
-    return MCDisassembler::Success;
+    return;
   }
 
   bool IsAtomic = (VDstIdx != -1);
@@ -1078,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
       // The NSA encoding does not contain enough operands for the
       // combination of base opcode / dimension. Should this be an error?
-      return MCDisassembler::Success;
+      return;
     }
     IsPartialNSA = true;
   }
@@ -1097,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     DstSize += 1;
 
   if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
-    return MCDisassembler::Success;
+    return;
 
   int NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
   if (NewOpcode == -1)
-    return MCDisassembler::Success;
+    return;
 
   // Widen the register to the correct number of enabled channels.
   unsigned NewVdata = AMDGPU::NoRegister;
@@ -1119,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     if (NewVdata == AMDGPU::NoRegister) {
       // It's possible to encode this such that the low register + enabled
       // components exceeds the register count.
-      return MCDisassembler::Success;
+      return;
     }
   }
 
@@ -1137,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
       NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
                                            &MRI.getRegClass(AddrRCID));
     if (!NewVAddrSA)
-      return MCDisassembler::Success;
+      return;
   }
 
   MI.setOpcode(NewOpcode);
@@ -1158,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     MI.erase(MI.begin() + VAddr0Idx + AddrSize,
              MI.begin() + VAddr0Idx + Info->VAddrDwords);
   }
-
-  return MCDisassembler::Success;
 }
 
 // Opsel and neg bits are used in src_modifiers and standalone operands. Autogen
 // decoder only adds to src_modifiers, so manually add the bits to the other
 // operands.
-DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
   auto Mods = collectVOPModifiers(MI, true);
@@ -1190,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
                          AMDGPU::OpName::neg_hi);
-
-  return MCDisassembler::Success;
 }
 
 // Create dummy old operand and insert optional operands
-DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
 
@@ -1212,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
     insertNamedMCOperand(MI, MCOperand::createImm(0),
                          AMDGPU::OpName::src1_modifiers);
-  return MCDisassembler::Success;
 }
 
-DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
-                                                int ImmLitIdx) const {
+void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
   assert(HasLiteral && "Should have decoded a literal");
   const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
   unsigned DescNumOps = Desc.getNumOperands();
@@ -1232,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
         IsDeferredOp)
       Op.setImm(Literal);
   }
-  return MCDisassembler::Success;
 }
 
 const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
@@ -1831,6 +1718,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
   return decodeSrcOp(OPW32, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
+  if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
+    return MCOperand();
+  return MCOperand::createImm(Val);
+}
+
 bool AMDGPUDisassembler::isVI() const {
   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3142b8a..2e1b6fb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -194,15 +194,15 @@ public:
   DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer,
                                        raw_string_ostream &KdStream) const;
 
-  DecodeStatus convertEXPInst(MCInst &MI) const;
-  DecodeStatus convertVINTERPInst(MCInst &MI) const;
-  DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
-  DecodeStatus convertSDWAInst(MCInst &MI) const;
-  DecodeStatus convertDPP8Inst(MCInst &MI) const;
-  DecodeStatus convertMIMGInst(MCInst &MI) const;
-  DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
-  DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
-  DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
+  void convertEXPInst(MCInst &MI) const;
+  void convertVINTERPInst(MCInst &MI) const;
+  void convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
+  void convertSDWAInst(MCInst &MI) const;
+  void convertDPP8Inst(MCInst &MI) const;
+  void convertMIMGInst(MCInst &MI) const;
+  void convertVOP3DPPInst(MCInst &MI) const;
+  void convertVOP3PDPPInst(MCInst &MI) const;
+  void convertVOPCDPPInst(MCInst &MI) const;
   void convertMacDPPInst(MCInst &MI) const;
   void convertTrue16OpSel(MCInst &MI) const;
 
@@ -261,6 +261,7 @@ public:
   MCOperand decodeBoolReg(unsigned Val) const;
   MCOperand decodeSplitBarrier(unsigned Val) const;
+  MCOperand decodeDpp8FI(unsigned Val) const;
 
   int getTTmpIdx(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a727134..00fa93c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) {
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp =
       TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
-  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
 }
 
 ScheduleHazardRecognizer::HazardType
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a45fea6..a32be1e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo,
 
 void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
                                    const MCSubtargetInfo &STI, raw_ostream &O) {
-  unsigned Id;
-  unsigned Offset;
-  unsigned Width;
-
   using namespace llvm::AMDGPU::Hwreg;
   unsigned Val = MI->getOperand(OpNo).getImm();
-  decodeHwreg(Val, Id, Offset, Width);
+  auto [Id, Offset, Width] = HwregEncoding::decode(Val);
   StringRef HwRegName = getHwreg(Id, STI);
 
   O << "hwreg(";
@@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
   } else {
     O << Id;
   }
-  if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
+  if (Width != HwregSize::Default || Offset != HwregOffset::Default)
     O << ", " << Offset << ", " << Width;
-  }
   O << ')';
 }
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 98310c3..0b516bf 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
   ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
   ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
-
-  ID_SHIFT_ = 0,
-  ID_WIDTH_ = 6,
-  ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
 };
 
 enum Offset : unsigned { // Offset, (5) [10:6]
-  OFFSET_DEFAULT_ = 0,
-  OFFSET_SHIFT_ = 6,
-  OFFSET_WIDTH_ = 5,
-  OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
-
   OFFSET_MEM_VIOL = 8,
 };
 
-enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
-  WIDTH_M1_DEFAULT_ = 31,
-  WIDTH_M1_SHIFT_ = 11,
-  WIDTH_M1_WIDTH_ = 5,
-  WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
-};
-
-// Some values from WidthMinusOne mapped into Width domain.
-enum Width : unsigned {
-  WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
-};
-
 enum ModeRegisterMasks : uint32_t {
   FP_ROUND_MASK = 0xf << 0,  // Bits 0..3
   FP_DENORM_MASK = 0xf << 4, // Bits 4..7
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d02aee7..4f106bf 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         .addImm(0);
     Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
 
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
-      addReg(FlatScrInitLo).
-      addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
-                     (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
-      addReg(FlatScrInitHi).
-      addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
-                     (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+    using namespace AMDGPU::Hwreg;
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+        .addReg(FlatScrInitLo)
+        .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+        .addReg(FlatScrInitHi)
+        .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
     return;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 257dff6..d8f528d8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   assert(Op.getValueType() == MVT::i32);
 
   uint32_t BothRoundHwReg =
-      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
   SDValue GetRoundBothImm =
       DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
   SDValue IntrinID =
@@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
 
   MachineBasicBlock::iterator I = LoopBB->end();
 
-  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
-      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
 
   // Clear TRAP_STS.MEM_VIOL
   BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
@@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   // Otherwise there was overflow and the result is hi2:0. In both cases the
   // result should represent the actual time at some point during the sequence
   // of three getregs.
+  using namespace AMDGPU::Hwreg;
   Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
-      .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
-                                         0, 32));
+      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
   Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
-      .addImm(
-          AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
   Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
-      .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
-                                         0, 32));
+      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
   BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
       .addReg(RegHi1)
       .addReg(RegHi2);
@@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     // FIXME: This could be predicates on the immediate, but tablegen doesn't
     // allow you to have a no side effect instruction in the output of a
     // sideeffecting pattern.
-    unsigned ID, Offset, Width;
-    AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+    auto [ID, Offset, Width] =
+        AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
     if (ID != AMDGPU::Hwreg::ID_MODE)
       return BB;
 
@@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                      DenominatorScaled, Flags);
 
-  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
-                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+  using namespace AMDGPU::Hwreg;
+  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
 
   const MachineFunction &MF = DAG.getMachineFunction();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8..a6184c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -480,6 +480,10 @@ public:
   // WaitEventType to corresponding counter values in InstCounterType.
   virtual const unsigned *getWaitEventMask() const = 0;
 
+  // Returns a new waitcnt with all counters except VScnt set to 0. If
+  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
   virtual ~WaitcntGenerator() = default;
 };
 
@@ -516,6 +520,8 @@ public:
 
     return WaitEventMaskForInstPreGFX12;
   }
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
@@ -549,6 +555,8 @@ public:
 
     return WaitEventMaskForInstGFX12Plus;
   }
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
   return Modified;
 }
 
+AMDGPU::Waitcnt
+WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+AMDGPU::Waitcnt
+WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
 /// were added by previous passes. Currently this pass conservatively
@@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       MI.getOpcode() == AMDGPU::SI_RETURN ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
-    Wait = Wait.combined(
-        AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
   // stores. In this case it can be useful to send a message to explicitly
@@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
-    Wait = Wait.combined(
-        AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
+    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   ScoreBrackets.simplifyWaitcnt(Wait);
 
   if (ForceEmitZeroWaitcnts)
-    Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
 
   if (ForceEmitWaitcnt[LOAD_CNT])
     Wait.LoadCnt = 0;
@@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     if (callWaitsOnFunctionReturn(Inst)) {
       // Act as a wait on everything
       ScoreBrackets->applyWaitcnt(
-          AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
       ScoreBrackets->setStateOnFunctionEntryOrReturn();
     } else {
       // May need to way wait for anything.
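In this pass a counter value of ~0u means "no wait needed on that counter", and the new getAllZeroWaitcnt hooks move the pre-GFX12 versus GFX12+ shape of an all-zero wait behind a virtual call (replacing the allZero/allZeroExceptVsCnt helpers removed from AMDGPUBaseInfo.h further down). A toy model of that sentinel-and-combine convention; the struct is a stand-in, not the real AMDGPU::Waitcnt:

    #include <cassert>

    // Toy model: a field of ~0u is "unconstrained", anything else is a hard
    // upper bound; combining two waits keeps the tighter bound per field.
    struct ToyWaitcnt {
      unsigned LoadCnt = ~0u;
      unsigned StoreCnt = ~0u;
      ToyWaitcnt combined(const ToyWaitcnt &O) const {
        return {LoadCnt < O.LoadCnt ? LoadCnt : O.LoadCnt,
                StoreCnt < O.StoreCnt ? StoreCnt : O.StoreCnt};
      }
    };

    int main() {
      ToyWaitcnt ZeroExceptStore{0, ~0u}; // like getAllZeroWaitcnt(false)
      ToyWaitcnt Current{~0u, 5};
      ToyWaitcnt W = Current.combined(ZeroExceptStore);
      assert(W.LoadCnt == 0);  // loads must drain completely
      assert(W.StoreCnt == 5); // store bound untouched by the "except" variant
    }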
class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
   dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
-                 (ins dpp8:$dpp8, DppFI:$fi));
+                 (ins dpp8:$dpp8, Dpp8FI:$fi));
 }
 
 // Ins for SDWA
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index e62ad02..c01b126 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
     unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask);
     unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset);
     unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    using namespace AMDGPU::Hwreg;
     BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
         .addImm(Value)
-        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
-                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
-                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+        .addImm(HwregEncoding::encode(ID_MODE, Offset, Width));
     ++NumSetregInserted;
     Changed = true;
     InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
@@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
       // as we assume it has been inserted by a higher authority (this is
       // likely to be a very rare occurrence).
       unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
-      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
-          AMDGPU::Hwreg::ID_MODE)
+      using namespace AMDGPU::Hwreg;
+      auto [Id, Offset, Width] = HwregEncoding::decode(Dst);
+      if (Id != ID_MODE)
         continue;
 
-      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
-                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
-                       1;
-      unsigned Offset =
-          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
       unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset;
 
       // If an InsertionPoint is set we will insert a setreg there.
"" : Opr[Idx].Name; } -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { - Id = (Val & ID_MASK_) >> ID_SHIFT_; - Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_; - Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; -} - } // namespace Hwreg //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f35e7744..6826cd2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs, } // end namespace IsaInfo +// Represents a field in an encoded value. +template <unsigned HighBit, unsigned LowBit, unsigned D = 0> +struct EncodingField { + static_assert(HighBit >= LowBit, "Invalid bit range!"); + static constexpr unsigned Offset = LowBit; + static constexpr unsigned Width = HighBit - LowBit + 1; + + using ValueType = unsigned; + static constexpr ValueType Default = D; + + ValueType Value; + constexpr EncodingField(ValueType Value) : Value(Value) {} + + constexpr uint64_t encode() const { return Value; } + static ValueType decode(uint64_t Encoded) { return Encoded; } +}; + +// A helper for encoding and decoding multiple fields. +template <typename... Fields> struct EncodingFields { + static constexpr uint64_t encode(Fields... Values) { + return ((Values.encode() << Values.Offset) | ...); + } + + static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) { + return {Fields::decode((Encoded >> Fields::Offset) & + maxUIntN(Fields::Width))...}; + } +}; + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); @@ -870,15 +899,6 @@ struct Waitcnt { : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} - static Waitcnt allZero(bool Extended, bool HasStorecnt) { - return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) - : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); - } - - static Waitcnt allZeroExceptVsCnt(bool Extended) { - return Extended ? 
Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); - } - bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { @@ -1030,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); namespace Hwreg { +using HwregId = EncodingField<5, 0>; +using HwregOffset = EncodingField<10, 6>; + +struct HwregSize : EncodingField<15, 11, 32> { + using EncodingField::EncodingField; + constexpr uint64_t encode() const { return Value - 1; } + static ValueType decode(uint64_t Encoded) { return Encoded + 1; } +}; + +using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>; + LLVM_READONLY int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); @@ -1043,13 +1074,8 @@ LLVM_READNONE bool isValidHwregWidth(int64_t Width); LLVM_READNONE -uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width); - -LLVM_READNONE StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); - } // namespace Hwreg namespace DepCtr { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 99f8e8e..f5424cf 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -380,9 +380,9 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un let OutsDPP = (outs Src0RC32:$vdst); let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; - let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi); + let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi); let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret; let OutsVOP3DPP = (outs Src0RC64:$vdst); @@ -749,7 +749,7 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP16 <op, ps, Gen.Subtarget, p> { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -770,7 +770,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } //===----------------------------------------------------------------------===// @@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16, - DecoderNamespace = "DPP" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp<Gen, op, opName>; } @@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8, - DecoderNamespace = "DPP8" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # 
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp8<Gen, op, opName>; } @@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP1_Real_dpp8_gfx10<bits<9> op> { if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1192,16 +1190,14 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } -multiclass VOP1Only_Real_vi <bits<10> op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { +let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1Only_Real_vi <bits<10> op> { def _vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; } -} -multiclass VOP1_Real_e32e64_vi <bits<10> op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1_Real_e32e64_vi <bits<10> op> { def _e32_vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; @@ -1389,44 +1385,41 @@ def : GCNPat < // GFX9 //===----------------------------------------------------------------------===// -multiclass VOP1_Real_gfx9 <bits<10> op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { + multiclass VOP1_Real_gfx9 <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; - } - - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; - -} -multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { - defm NAME : VOP1_Real_e32e64_vi <op>; + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; } - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { - let Inst{42-40} = 6; - } + multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> { + defm NAME : VOP1_Real_e32e64_vi <op>; - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + let Inst{42-40} = 6; + } + + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP 
then + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; + } } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; -let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in +let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; let OtherPredicates = [HasFP8ConversionInsts] in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 4437d5f..13fe79b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -430,7 +430,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v getVregSrcForVT<Src2VT>.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, RegisterOperand<VGPR_32>, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; @@ -447,7 +447,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT<Src2VT>.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, getVregSrcForVT<Src2VT>.ret:$src2, // stub argument @@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> { let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument } @@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/ Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let OutsVOP3DPP = Outs64; @@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1> Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let HasExt = 1; let HasExtDPP = 1; @@ -645,11 +645,11 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> { FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, FPVRegInputMods:$src0_modifiers, Src0DPP:$src0, FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, 
Dpp8FI:$fi); let Src0ModVOP3DPP = FPVRegInputMods; let Src1ModVOP3DPP = FPVRegInputMods; @@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen, VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen, VOP2_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } //===------------------------- VOP2 (with name) -------------------------===// @@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"; } } @@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); - let DecoderNamespace = "DPP8"; } if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w32_gfx10 : @@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> { VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } -let AssemblerPredicate = isGFX8Only in { +let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in { multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> { def _e32_vi : @@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } def _e64_vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : @@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; } } -} -let AssemblerPredicate = isGFX9Only in { +} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" + +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { def _e32_gfx9 : @@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { VOP2e<op{5-0}, 
!cast<VOP2_Pseudo>(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } def _e64_gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : @@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } } multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { def _e32_gfx9 : VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{ - let DecoderNamespace = "GFX9"; - } + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; def _e64_gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>, - VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - let DecoderNamespace = "GFX9"; - } + VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, @@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { - let DecoderNamespace = "GFX9"; - } + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } -} // AssemblerPredicate = isGFX9Only +} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi <bits<6> op> : Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 396ae9c..7198a40 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, - VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasExtVOP3DPP = 1; @@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, - 
op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 74f451b..ac3c8f9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>, let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, - neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi); + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi); let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, - DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); } multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> { @@ -1353,6 +1353,7 @@ class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget, let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> let SchedRW = ps.SchedRW; let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } //===----------------------------------------------------------------------===// @@ -1486,7 +1488,7 @@ multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"), Gen.Subtarget> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1496,7 +1498,7 @@ multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1613,7 +1615,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"), VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> { - let SubtargetPredicate = isGFX940Plus, + let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>, diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index fe52a0e..e5e8244 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -222,6 +222,8 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, let AsmVariantName = AMDGPUAsmVariants.Default; let SubtargetPredicate = AssemblerPredicate; + + string DecoderNamespace; // dummy } multiclass VOPCInstAliases <string old_name, string Arch, 
string real_name = old_name, string mnemonic_from = real_name> { @@ -766,7 +768,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP16 = AsmDPP#"$fi"; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); @@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> //===----------------------------------------------------------------------===// multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>, - VOPCe<op{7-0}>; - def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>, + VOPCe<op{7-0}>; + def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
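
Stepping back to the SIModeRegister.cpp hunk earlier in this section: insertSetreg now derives the setreg operands from the mode mask with countr_zero/countr_one and packs them through HwregEncoding::encode, rather than hand-shifting the old *_SHIFT_ constants. A self-contained illustration of that mask decomposition, using C++20 <bit> in place of the llvm::countr_* helpers:

#include <bit>
#include <cassert>

int main() {
  unsigned Mask = 0b0000'1100;                       // example: bits [3:2] set
  unsigned Offset = std::countr_zero(Mask);          // 2: trailing zero bits
  unsigned Width  = std::countr_one(Mask >> Offset); // 2: run of ones at Offset
  assert(Offset == 2 && Width == 2);
  // In the patch, these feed HwregEncoding::encode(ID_MODE, Offset, Width)
  // to build the S_SETREG_IMM32_B32 immediate.
}
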
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>; - def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>; - def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; - def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; - def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let 
WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - // 32 and 64 bit forms of the instruction have _e32 and _e64 - // respectively appended to their assembly mnemonic. - // _e64 is printed as part of the VOPDstS64orS32 operand, whereas - // the destination-less 32bit forms add it to the asmString here. - VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">, - VOPCe<op{7-0}>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic, - pseudo_mnemonic), - asm_name, ps32.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>; - def _e64#Gen.Suffix : - VOP3_Real<ps64, Gen.Subtarget, asm_name>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic, - pseudo_mnemonic), - asm_name, ps64.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. + // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. + VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">, + VOPCe<op{7-0}>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic, + pseudo_mnemonic), + asm_name, ps32.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>; + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget, asm_name>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic, + pseudo_mnemonic), + asm_name, ps64.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
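
The EncodingField / EncodingFields helpers added to AMDGPUBaseInfo.h earlier in this section replace the hand-maintained *_SHIFT_/*_MASK_/*_WIDTH_ constants with self-describing bit ranges, and HwregSize shows how a field can override encode/decode for width-minus-one storage. Lifted into a standalone program for reference — the templates and Hwreg field definitions are verbatim from the patch, while the maxUIntN stand-in (normally from llvm/Support/MathExtras.h) and the driver are illustrative additions; compiles with C++17:

#include <cassert>
#include <cstdint>
#include <tuple>

// Stand-in for llvm::maxUIntN: the all-ones value of width N.
static constexpr uint64_t maxUIntN(unsigned N) {
  return N >= 64 ? ~uint64_t(0) : (uint64_t(1) << N) - 1;
}

template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
struct EncodingField {
  static_assert(HighBit >= LowBit, "Invalid bit range!");
  static constexpr unsigned Offset = LowBit;
  static constexpr unsigned Width = HighBit - LowBit + 1;

  using ValueType = unsigned;
  static constexpr ValueType Default = D;

  ValueType Value;
  constexpr EncodingField(ValueType Value) : Value(Value) {}

  constexpr uint64_t encode() const { return Value; }
  static ValueType decode(uint64_t Encoded) { return Encoded; }
};

template <typename... Fields> struct EncodingFields {
  static constexpr uint64_t encode(Fields... Values) {
    return ((Values.encode() << Values.Offset) | ...);
  }
  static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
    return {Fields::decode((Encoded >> Fields::Offset) &
                           maxUIntN(Fields::Width))...};
  }
};

// Hwreg layout from the patch: id in [5:0], offset in [10:6], size in [15:11]
// stored as width-1 (hence HwregSize's overriding encode/decode).
using HwregId = EncodingField<5, 0>;
using HwregOffset = EncodingField<10, 6>;
struct HwregSize : EncodingField<15, 11, 32> {
  using EncodingField::EncodingField;
  constexpr uint64_t encode() const { return Value - 1; }
  static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
};
using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;

int main() {
  uint64_t Imm = HwregEncoding::encode(/*Id=*/1, /*Offset=*/6, /*Width=*/32);
  auto [Id, Offset, Width] = HwregEncoding::decode(Imm);
  assert(Id == 1 && Offset == 6 && Width == 32);
  (void)Id; (void)Offset; (void)Width;
}

Because the field objects dispatch statically, HwregSize's overrides are picked up without virtual calls, and the fold expressions in encode/decode make adding or reordering fields a one-line change.
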
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, - Gen.Subtarget, asm_name>; - def _e32_dpp_w32#Gen.Suffix - : VOPC_DPP16<op{7-0}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix - : VOPC_DPP16<op{7-0}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; + def _e32_dpp_w32#Gen.Suffix + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; - def _e32_dpp8_w32#Gen.Suffix - : VOPC_DPP8<op{7-0}, ps32, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix - : VOPC_DPP8<op{7-0}, ps32, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; + def _e32_dpp8_w32#Gen.Suffix + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; - def _e64_dpp_w32#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : 
VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; - def _e64_dpp8_w32#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name, @@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name, VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>; multiclass VOPCX_Real<GFXGen Gen, bits<9> op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - VOPC_Real<ps32, Gen.Subtarget>, - VOPCe<op{7-0}> { - let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) - # " " # ps32.AsmOperands; - } - def _e64#Gen.Suffix : - VOP3_Real<ps64, Gen.Subtarget>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", ps64.Mnemonic) - # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + VOPC_Real<ps32, Gen.Subtarget>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix - : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> { - let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; - } + def _e32_dpp#Gen.Suffix + : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def 
_e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { - let AsmString = !subst("_nosdst", "", psDPP.OpName) - # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { - let AsmString = !subst("_nosdst", "", ps64.OpName) - # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix - : VOPC_Real<ps32, Gen.Subtarget, asm_name>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic), - pseudo_mnemonic), - asm_name, ps32.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>, - VOPCe<op{7-0}> { - let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; - } - def _e64#Gen.Suffix - : VOP3_Real<ps64, Gen.Subtarget, asm_name>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic), - pseudo_mnemonic), - asm_name, ps64.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ? ; // sdst - let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix + : VOPC_Real<ps32, Gen.Subtarget, asm_name>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic), + pseudo_mnemonic), + asm_name, ps32.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>, + VOPCe<op{7-0}> { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64#Gen.Suffix + : VOP3_Real<ps64, Gen.Subtarget, asm_name>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic), + pseudo_mnemonic), + asm_name, ps64.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? 
; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp"); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, - Gen.Subtarget, asm_name>; - } - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name, @@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Only in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOPC_Real_gfx10<bits<9> op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, - VOPCe<op{7-0}>; - def _e64_gfx10 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}>; + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in { } multiclass VOPCX_Real_gfx10<bits<9> op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, - VOPCe<op{7-0}> { - let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr) - # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands; - } - - def _e64_gfx10 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) - # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; + } if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in { defm : VOPCXInstAliases<NAME, "gfx10">; } -} // End AssemblerPredicate = isGFX10Only +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; // GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX6GFX7 in { +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass VOPC_Real_gfx6_gfx7<bits<9> op> { - let DecoderNamespace = "GFX6GFX7" in { - def _e32_gfx6_gfx7 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe<op{7-0}>; - def _e64_gfx6_gfx7 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX6GFX7" + def _e32_gfx6_gfx7 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOPCe<op{7-0}>; + def _e64_gfx6_gfx7 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<NAME, "gfx6_gfx7">; } -} // End AssemblerPredicate = isGFX6GFX7 +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> : VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 801afab..80d7d96 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -818,6 +818,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let VALU = 1; let DPP = 1; let Size = 8; + let IsPacked = P.IsPacked; let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); @@ -835,7 +836,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; VOPProfile Pfl = P; } @@ -906,7 +907,7 @@ class VOP_DPP_Base <string OpName, VOPProfile P, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; } class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, @@ -1350,7 +1351,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen, VOP3_DPP16 <op, ps, Gen.Subtarget, opName> { let AssemblerPredicate = Gen.AssemblerPredicate; let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1463,7 +1464,7 @@ multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName, multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1473,7 +1474,7 @@ multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { let Inst{11} = ?; let Inst{12} = ?; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1482,7 +1483,7 @@ multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName, string asmName> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, - DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"), True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate) in { @@ -1505,7 +1506,7 @@ multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName, defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp"); def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>, SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> { - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1514,7 +1515,7 @@ multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName, string 
asmName> { defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index c5199aab..00a29f8 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -25,42 +25,6 @@ using namespace llvm; using namespace LegalizeActions; -/// FIXME: The following static functions are SizeChangeStrategy functions -/// that are meant to temporarily mimic the behaviour of the old legalization -/// based on doubling/halving non-legal types as closely as possible. This is -/// not entirly possible as only legalizing the types that are exactly a power -/// of 2 times the size of the legal types would require specifying all those -/// sizes explicitly. -/// In practice, not specifying those isn't a problem, and the below functions -/// should disappear quickly as we add support for legalizing non-power-of-2 -/// sized types further. -static void addAndInterleaveWithUnsupported( - LegacyLegalizerInfo::SizeAndActionsVec &result, - const LegacyLegalizerInfo::SizeAndActionsVec &v) { - for (unsigned i = 0; i < v.size(); ++i) { - result.push_back(v[i]); - if (i + 1 < v[i].first && i + 1 < v.size() && - v[i + 1].first != v[i].first + 1) - result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported}); - } -} - -static LegacyLegalizerInfo::SizeAndActionsVec -widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) { - assert(v.size() >= 1); - assert(v[0].first > 17); - LegacyLegalizerInfo::SizeAndActionsVec result = { - {1, LegacyLegalizeActions::Unsupported}, - {8, LegacyLegalizeActions::WidenScalar}, - {9, LegacyLegalizeActions::Unsupported}, - {16, LegacyLegalizeActions::WidenScalar}, - {17, LegacyLegalizeActions::Unsupported}}; - addAndInterleaveWithUnsupported(result, v); - auto Largest = result.back().first; - result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported}); - return result; -} - static bool AEABI(const ARMSubtarget &ST) { return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI(); } @@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .libcallFor({s32}) .clampScalar(0, s32, s32); - for (unsigned Op : {G_SREM, G_UREM}) { - LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16); - if (HasHWDivide) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower); - else if (AEABI(ST)) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom); - else - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall); - } + auto &REMBuilder = + getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32); + if (HasHWDivide) + REMBuilder.lowerFor({s32}); + else if (AEABI(ST)) + REMBuilder.customFor({s32}); + else + REMBuilder.libcallFor({s32}); getActionDefinitionsBuilder(G_INTTOPTR) .legalFor({{p0, s32}}) @@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { LoadStoreBuilder.maxScalar(0, s32); - for (auto Ty : {s32, s64}) - LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower); + getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64}); getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64}); diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 5215813..8a3454c 100644 --- 
a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">; def UnaryFloatCategory : DXILOpCategory<"Unary float">; def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; -// Following are the scalar types supported by DXIL operations and are synonymous -// to llvm_*_ty defined for readability and ease of use in the context of this file. - -def voidTy : LLVMType<isVoid>; - -// Floating point types -def f16Ty : LLVMType<f16>; -def f32Ty : LLVMType<f32>; -def f64Ty : LLVMType<f64>; - -// Integer types -def i1Ty : LLVMType<i1>; -def i8Ty : LLVMType<i8>; -def i16Ty : LLVMType<i16>; -def i32Ty : LLVMType<i32>; -def i64Ty : LLVMType<i64>; +// Represent as any pointer type with an option to change to a qualified pointer +// type with address space specified. +def dxil_handle_ty : LLVMAnyPointerType; +def dxil_cbuffer_ty : LLVMAnyPointerType; +def dxil_resource_ty : LLVMAnyPointerType; // The parameter description for a DXIL operation -class DXILOpParameter<int pos, string type, string name, string doc, +class DXILOpParameter<int pos, LLVMType type, string name, string doc, bit isConstant = 0, string enumName = "", int maxValue = 0> { int Pos = pos; // Position in parameter list - string Type = type; // LLVM type name, $o for overload, $r for resource - // type, $cb for legacy cbuffer, $u4 for u4 struct + LLVMType ParamType = type; // Parameter type string Name = name; // Short, unique parameter name string Doc = doc; // Description of this parameter bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR @@ -108,55 +96,55 @@ class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; } def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", - [f16Ty,f32Ty], ReadNone, + [llvm_half_ty, llvm_float_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value"> ], ["floats"]>, LLVMIntrinsic<int_sin>; -def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", - [i16Ty,i32Ty,i64Ty], ReadNone, +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? 
a : b", + [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "a", "input value">, - DXILOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, llvm_anyint_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyint_ty, "a", "input value">, + DXILOpParameter<3, llvm_anyint_ty, "b", "input value"> ], ["uints"]>, LLVMIntrinsic<int_umax>; -def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone, +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic<int_dx_thread_id>; -def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone, +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "group ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, llvm_i32_ty, "", "group ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read"> ]>, LLVMIntrinsic<int_dx_group_id>; -def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, - "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty], ReadNone, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, + "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID in group component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic<int_dx_thread_id_in_group>; -def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, - "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, + "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, llvm_i32_ty, "", "result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode"> ]>, LLVMIntrinsic<int_dx_flattened_thread_id_in_group>; diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 76f99b4..2870f0b 100644 
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen HexagonFrameLowering.cpp HexagonGenExtract.cpp HexagonGenInsert.cpp + HexagonGenMemAbsolute.cpp HexagonGenMux.cpp HexagonGenPredicate.cpp HexagonHardwareLoops.cpp @@ -50,6 +51,7 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp + HexagonPostIncOpt.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp @@ -60,6 +62,7 @@ add_llvm_target(HexagonCodeGen HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp HexagonTargetTransformInfo.cpp + HexagonTfrCleanup.cpp HexagonVectorCombine.cpp HexagonVectorLoopCarriedReuse.cpp HexagonVectorPrint.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 6024d9f..3b8234c 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { return false; const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); RegHalf H; - if (!matchHalf(0, RC, 0, H)) + unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0; + if (!matchHalf(0, RC, B, H)) return false; if (H.Low) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp new file mode 100644 index 0000000..afd4963 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -0,0 +1,274 @@ +//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This pass traverses all the basic blocks in a function and converts +// an indexed load/store with offset "0" to an absolute-set load/store +// instruction, as long as the use of the register in the new instruction +// dominates the rest of the uses and there are more than 2 uses.
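// A minimal sketch of the transformation (hypothetical registers and a
// hypothetical global @tbl; the actual opcode mapping comes from the
// isValidIndexedLoad/isValidIndexedStore tables below):
//   r1 = CONST32(#@tbl)       ; or A2_tfrsi
//   r2 = memw(r1+#0)          ; L2_loadri_io with offset 0
//   ...                       ; two or more further uses of r1, all
//                             ; dominated by the load
// becomes the absolute-set form, which defines both registers at once:
//   r2 = memw(r1=##@tbl)      ; L4_loadri_ap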
+ +#include "HexagonTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "hexagon-abs" + +using namespace llvm; + +STATISTIC(HexagonNumLoadAbsConversions, + "Number of Load instructions converted to absolute-set form"); +STATISTIC(HexagonNumStoreAbsConversions, + "Number of Store instructions converted to absolute-set form"); + +namespace llvm { +FunctionPass *createHexagonGenMemAbsolute(); +void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry); +} // namespace llvm + +namespace { + +class HexagonGenMemAbsolute : public MachineFunctionPass { + const HexagonInstrInfo *TII; + MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) { + initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon Generate Load/Store Set Absolute Address Instruction"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + static bool isValidIndexedLoad(int &Opcode, int &NewOpcode); + static bool isValidIndexedStore(int &Opcode, int &NewOpcode); +}; +} // namespace + +char HexagonGenMemAbsolute::ID = 0; + +INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute", + "Hexagon Generate Load/Store Set Absolute Address Instruction", + false, false) + +bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return false; + + TII = Fn.getSubtarget<HexagonSubtarget>().getInstrInfo(); + MRI = &Fn.getRegInfo(); + TRI = Fn.getRegInfo().getTargetRegisterInfo(); + + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + + // Loop over all of the basic blocks + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock *MBB = &*MBBb; + // Traverse the basic block + for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = &*MII; + int Opc = MI->getOpcode(); + if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi) + continue; + + const MachineOperand &MO = MI->getOperand(0); + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned DstReg = MO.getReg(); + if (MRI->use_nodbg_empty(DstReg)) + continue; + + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg); + + MachineInstr *NextMI = NextUseMI->getParent(); + int NextOpc = NextMI->getOpcode(); + int NewOpc; + bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc); + + if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc)) + continue; + + // Base and Offset positions for load and store instructions + // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm) + // Store R(base), Imm, R (src) -> mem(R(base) + Imm) = R(src) + unsigned BaseRegPos, ImmPos, RegPos; + if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos)) + continue; + RegPos 
= IsLoad ? 0 : 2; + + bool IsGlobal = MI->getOperand(1).isGlobal(); + if (!MI->getOperand(1).isImm() && !IsGlobal) + continue; + + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool Scalable; + TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI); + + // Ensure BaseOp is non-null and is a register operand. + if (!BaseOp || !BaseOp->isReg()) + continue; + + if (Scalable) + continue; + + unsigned BaseReg = BaseOp->getReg(); + if ((DstReg != BaseReg) || (Offset != 0)) + continue; + + const MachineOperand &MO0 = NextMI->getOperand(RegPos); + + if (!MO0.isReg()) + continue; + + unsigned LoadStoreReg = MO0.getReg(); + + // Store: Bail out if the src and base are the same (def and use on same + // register). + if (LoadStoreReg == BaseReg) + continue; + + // Insert the absolute-set instruction "I" only if the use of the + // BaseReg in "I" dominates the rest of the uses of BaseReg and if + // there are more than 2 uses of this BaseReg. + bool Dominates = true; + unsigned Counter = 0; + for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) { + Counter++; + if (!MDT.dominates(NextMI, I->getParent())) + Dominates = false; + } + + if ((!Dominates) || (Counter < 3)) + continue; + + // If we reach here, we have met all the conditions required for the + // replacement with an absolute-set instruction. + LLVM_DEBUG({ + dbgs() << "Found a pair of instructions for absolute-set " + << (IsLoad ? "load" : "store") << "\n"; + dbgs() << *MI; + dbgs() << *NextMI; + }); + MachineBasicBlock *ParentBlock = NextMI->getParent(); + MachineInstrBuilder MIB; + if (IsLoad) { // Insert absolute-set load instruction + ++HexagonNumLoadAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), LoadStoreReg) + .addReg(DstReg, RegState::Define); + } else { // Insert absolute-set store instruction + ++HexagonNumStoreAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), DstReg); + } + + MachineOperand ImmOperand = MI->getOperand(1); + if (IsGlobal) + MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(), + ImmOperand.getTargetFlags()); + else + MIB.addImm(ImmOperand.getImm()); + + if (IsLoad) + MIB->getOperand(0).setSubReg(MO0.getSubReg()); + else + MIB.addReg(LoadStoreReg, 0, MO0.getSubReg()); + + LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n"); + // Erase the instructions that got replaced.
+ MII = MBB->erase(MI); + --MII; + NextMI->getParent()->erase(NextMI); + } + } + + return true; +} + +bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::L2_loadrb_io: + NewOpc = Hexagon::L4_loadrb_ap; + break; + case Hexagon::L2_loadrh_io: + NewOpc = Hexagon::L4_loadrh_ap; + break; + case Hexagon::L2_loadri_io: + NewOpc = Hexagon::L4_loadri_ap; + break; + case Hexagon::L2_loadrd_io: + NewOpc = Hexagon::L4_loadrd_ap; + break; + case Hexagon::L2_loadruh_io: + NewOpc = Hexagon::L4_loadruh_ap; + break; + case Hexagon::L2_loadrub_io: + NewOpc = Hexagon::L4_loadrub_ap; + break; + default: + Result = false; + } + + return Result; +} + +bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::S2_storerd_io: + NewOpc = Hexagon::S4_storerd_ap; + break; + case Hexagon::S2_storeri_io: + NewOpc = Hexagon::S4_storeri_ap; + break; + case Hexagon::S2_storerh_io: + NewOpc = Hexagon::S4_storerh_ap; + break; + case Hexagon::S2_storerb_io: + NewOpc = Hexagon::S4_storerb_ap; + break; + default: + Result = false; + } + + return Result; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonGenMemAbsolute() { + return new HexagonGenMemAbsolute(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 619c7dc..91cc930 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const { return getAddrMode(MI) == HexagonII::PostInc; } +bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const { + unsigned BasePos, OffsetPos; + if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return false; + return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm(); +} + // Returns true if an instruction is predicated irrespective of the predicate // sense. For example, all of the following will return true. 
// if (p0) R1 = add(R2, R3) @@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const { Opcode == Hexagon::J2_loop1rext; } +bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + case Hexagon::L2_loadalignb_pci: + case Hexagon::L2_loadalignb_pcr: + case Hexagon::L2_loadalignh_pci: + case Hexagon::L2_loadalignh_pcr: + case Hexagon::L2_loadbsw2_pci: + case Hexagon::L2_loadbsw2_pcr: + case Hexagon::L2_loadbsw4_pci: + case Hexagon::L2_loadbsw4_pcr: + case Hexagon::L2_loadbzw2_pci: + case Hexagon::L2_loadbzw2_pcr: + case Hexagon::L2_loadbzw4_pci: + case Hexagon::L2_loadbzw4_pcr: + case Hexagon::L2_loadrb_pci: + case Hexagon::L2_loadrb_pcr: + case Hexagon::L2_loadrd_pci: + case Hexagon::L2_loadrd_pcr: + case Hexagon::L2_loadrh_pci: + case Hexagon::L2_loadrh_pcr: + case Hexagon::L2_loadri_pci: + case Hexagon::L2_loadri_pcr: + case Hexagon::L2_loadrub_pci: + case Hexagon::L2_loadrub_pcr: + case Hexagon::L2_loadruh_pci: + case Hexagon::L2_loadruh_pcr: + case Hexagon::S2_storerbnew_pci: + case Hexagon::S2_storerbnew_pcr: + case Hexagon::S2_storerb_pci: + case Hexagon::S2_storerb_pcr: + case Hexagon::S2_storerd_pci: + case Hexagon::S2_storerd_pcr: + case Hexagon::S2_storerf_pci: + case Hexagon::S2_storerf_pcr: + case Hexagon::S2_storerhnew_pci: + case Hexagon::S2_storerhnew_pcr: + case Hexagon::S2_storerh_pci: + case Hexagon::S2_storerh_pcr: + case Hexagon::S2_storerinew_pci: + case Hexagon::S2_storerinew_pcr: + case Hexagon::S2_storeri_pci: + case Hexagon::S2_storeri_pcr: + return true; + } + return false; +} + bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return false; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index e496995..65783c5 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -434,6 +434,8 @@ public: bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const; bool PredOpcodeHasJMP_c(unsigned Opcode) const; bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; + bool isPostIncWithImmOffset(const MachineInstr &MI) const; + bool isCircBufferInstr(const MachineInstr &MI) const; unsigned getAddrMode(const MachineInstr &MI) const; MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset, diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp new file mode 100644 index 0000000..4c845f2 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp @@ -0,0 +1,689 @@ +//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Convert post-inc addressing mode into base-offset addressing mode. +// Ex: +// original loop: +// v1 = phi(v0, v3) +// v2,v3 = post_load v1, 4 + +// Often, the unroller creates the following form of post-increments: +// v1 = phi(v0, v3') +// v2,v3 = post_load v1, 4 +// v2',v3'= post_load v3, 4 + +// This can be optimized in two ways: + +// 1. +// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v3', -4 +// +// 2.
+// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v1, 4 +// +// Option 2 is favored as we can packetize two memory operations in a single +// packet. However, this is not always favorable due to memory dependences +// and in cases where we form a bigger chain of post-increment ops that will +// create more spills as we cannot execute post-increment ops without +// executing base-offset instructions. +//===----------------------------------------------------------------------===// +#include "HexagonInstrInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagon-postincopt" + +static cl::opt<unsigned> PostIncChainThreshold( + "post-inc-chain-threshold", cl::Hidden, cl::init(4), + cl::desc("Limit the number of post-inc instructions in a chain.")); + +static cl::opt<bool> PreferPostIncStore( + "prefer-post-inc-store", cl::Hidden, cl::init(true), + cl::desc("Prefer post-inc store in a list of loads and stores.")); + +namespace llvm { +void initializeHexagonPostIncOptPass(PassRegistry &); +FunctionPass *createHexagonPostIncOpt(); +} // namespace llvm + +namespace { + +class HexagonPostIncOpt : public MachineFunctionPass { + MachineLoopInfo *MLI = nullptr; + const HexagonInstrInfo *HII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const HexagonSubtarget *HST = nullptr; + +public: + static char ID; + + HexagonPostIncOpt() : MachineFunctionPass(ID) { + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + bool translatePostIncsInLoop(MachineBasicBlock &MBB); + void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const; + void replacePostIncWithBaseOffset(MachineInstr &MI) const; + bool isPostIncInsn(MachineInstr &MI) const; + void foldAdds(MachineBasicBlock &MBB) const; + void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const; + void removeDeadInstructions(MachineBasicBlock &MBB) const; + + void generatePostInc(MachineBasicBlock &MBB); + bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + + bool isValidOffset(const MachineInstr &MI, int64_t Offset) const; + bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const; +}; + +class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs { + HexagonPostIncOpt &Pass; + +public: + HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF, + MachineLoopInfo *MLI) + : ScheduleDAGInstrs(MF, MLI,
false), Pass(P){}; + void schedule() override; + ScheduleDAGTopologicalSort &getTopo() { return Topo; }; +}; + +} // End anonymous namespace. + +char HexagonPostIncOpt::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE, + "Hexagon Post-Inc-Opt Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass", + false, false) + +/// Return true if MIA dominates MIB. +static bool dominates(MachineInstr *MIA, MachineInstr *MIB) { + if (MIA->getParent() != MIB->getParent()) + return false; // Don't know since machine dominator tree is out of date. + + MachineBasicBlock *MBB = MIA->getParent(); + MachineBasicBlock::iterator I = MBB->instr_begin(); + // Iterate over the basic block until MIA or MIB is found. + for (; &*I != MIA && &*I != MIB; ++I) + ; + + // MIA dominates MIB if MIA is found first. + return &*I == MIA; +} + +// Return the Phi register value that comes from the loop block. +static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i).getReg(); + return UINT_MAX; +} + +static bool isAddWithImmValue(const MachineInstr &MI) { + // FIXME: For now, only deal with adds that have strict immediate values. + // Some A2_addi instructions can be of the form. + // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16 + return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm(); +} + +// Compute the number of 'real' instructions in the basic block by +// ignoring terminators. +static unsigned getBasicBlockSize(MachineBasicBlock &MBB) { + unsigned size = 0; + for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator())) + if (!I.isDebugInstr()) + size++; + return size; +} + +// Setup Post increment Schedule DAG. +static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG, + MachineBasicBlock &MBB) { + PIDAG.startBlock(&MBB); + PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(), + getBasicBlockSize(MBB)); + // Build the graph. + PIDAG.schedule(); + // exitRegion() is an empty function in base class. So, safe to call it here. + PIDAG.exitRegion(); +} + +// Check if post-increment candidate has any memory dependence on any +// instruction in the chain. +static bool hasMemoryDependency(SUnit *PostIncSU, + SmallVector<MachineInstr *, 4> &UseList) { + + // FIXME: Fine tune the order dependence. Probably can only consider memory + // related OrderKind. + for (auto &Dep : PostIncSU->Succs) + if (Dep.getKind() == SDep::Order) + if (std::find(UseList.begin(), UseList.end(), + Dep.getSUnit()->getInstr()) != UseList.end()) + return true; + + return false; +} + +// Fold an add with immediate into either an add or a load or a store. +void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n"); + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + if (!isAddWithImmValue(MI)) + continue; + unsigned DefReg = MI.getOperand(0).getReg(); + unsigned AddReg = MI.getOperand(1).getReg(); + int64_t AddImm = MI.getOperand(2).getImm(); + + SmallVector<MachineInstr *, 4> UseList; + // Gather the uses of add instruction's def reg. + for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + // Deal with only the instructions that belong to this block.
+ // If we cross this block, the generation of post-increment logic + // will not be able to transform to post-inc due to dominance. + if (UseMI->getParent() == &MBB) + UseList.push_back(UseMI); + } + + if (UseList.empty()) + continue; + + LLVM_DEBUG({ + dbgs() << "Current instruction considered for folding \n"; + MI.dump(); + }); + + for (auto UseMI : UseList) { + if (isAddWithImmValue(*UseMI)) { + int64_t NewImm = AddImm + UseMI->getOperand(2).getImm(); + // Fold if the new immediate is within the range. + if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded into \n"; + }); + UseMI->getOperand(1).setReg(AddReg); + UseMI->getOperand(2).setImm(NewImm); + LLVM_DEBUG(UseMI->dump()); + } + } else if (HII->isBaseImmOffset(*UseMI)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded into \n"; + }); + updateBaseAndOffset(*UseMI, MI); + LLVM_DEBUG(UseMI->dump()); + } + LLVM_DEBUG(dbgs() << "\n"); + } + } + removeDeadInstructions(MBB); + LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n"); +} + +void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI, + MachineInstr &AddMI) const { + assert(HII->isBaseImmOffset(MI)); + unsigned BasePos, OffsetPos; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + + MachineOperand &OffsetOp = MI.getOperand(OffsetPos); + MachineOperand &BaseOp = MI.getOperand(BasePos); + + if (BaseOp.getReg() != AddMI.getOperand(0).getReg()) + return; + + unsigned IncBase = AddMI.getOperand(1).getReg(); + int64_t IncValue = AddMI.getOperand(2).getImm(); + + int64_t NewOffset = OffsetOp.getImm() + IncValue; + if (!isValidOffset(MI, NewOffset)) + return; + + OffsetOp.setImm(NewOffset); + BaseOp.setReg(IncBase); +} + +void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const { + // For MBB, check that the value defined by each instruction is used. + // If not, delete it. + for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(), + ME = MBB.instr_rend(); + MI != ME;) { + // From DeadMachineInstructionElim. Don't delete inline assembly. + if (MI->isInlineAsm()) { + ++MI; + continue; + } + bool SawStore = false; + // Check if it's safe to remove the instruction due to side effects. + if (!MI->isSafeToMove(nullptr, SawStore)) { + ++MI; + continue; + } + unsigned Uses = 0; + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + unsigned reg = MOI->getReg(); + // Assume physical registers are used. + if (Register::isPhysicalRegister(reg)) { + Uses++; + continue; + } + if (MRI->use_begin(reg) != MRI->use_end()) + Uses++; + } + if (!Uses) { + MI++->eraseFromParent(); + continue; + } + ++MI; + } +} + +bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const { + // Predicated post-increments are not yet handled. (ISel is not generating + // them yet). Circular buffer instructions should not be handled. + return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) && + !HII->isCircBufferInstr(MI)); +} + +/// For instructions with a base and offset, return true if the new Offset +/// is a valid value with the correct alignment.
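/// (For example, under this rule a 4-byte access with Offset == 6 is
/// rejected even when 6 is within the encodable range, since 6 & 3 != 0.)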
+bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI, + int64_t Offset) const { + if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false)) + return false; + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + return (Offset & AlignMask) == 0; +} + +bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI, + int IncVal) const { + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + if ((IncVal & AlignMask) != 0) + return false; + + // Number of total bits in the instruction used to encode Inc value. + unsigned IncBits = 4; + // For HVX instructions, it is 3 bits. + if (HexagonII::isCVI(MI.getDesc())) + IncBits = 3; + + IncBits += Log2_32(HII->getMemAccessSize(MI)); + if (HII->getMemAccessSize(MI) > 8) + IncBits = 16; + + int MinValidVal = -1U << (IncBits - 1); + int MaxValidVal = ~(-1U << (IncBits - 1)); + return (IncVal >= MinValidVal && IncVal <= MaxValidVal); +} + +void HexagonPostIncOptSchedDAG::schedule() { + AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults(); + buildSchedGraph(AA); +} + +// Replace post-increment operations with base+offset counterpart. +void HexagonPostIncOpt::replacePostIncWithBaseOffset( + MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with " + "base+offset counterparts.\n"); + + SmallVector<MachineInstr *, 4> MIList; + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + // Check for eligible post-inc candidates. + if (!isPostIncInsn(MI)) + continue; + MIList.push_back(&MI); + } + + for (auto MI : MIList) + replacePostIncWithBaseOffset(*MI); + + LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n"); +} + +void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const { + short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode()); + if (NewOpcode < 0) + return; + + unsigned BasePos = 0, OffsetPos = 0; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos); + const MachineOperand &PostIncBase = MI.getOperand(BasePos); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineOperand *PostIncDest; + MachineInstrBuilder MIB; + if (MI.mayLoad()) { + PostIncDest = &MI.getOperand(1); + const MachineOperand &LDValue = MI.getOperand(0); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(PostIncBase).addImm(0); + } else { + PostIncDest = &MI.getOperand(0); + const MachineOperand &STValue = MI.getOperand(3); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + MIB.add(PostIncBase).addImm(0).add(STValue); + } + + // Transfer memoperands. + MIB->cloneMemRefs(*MBB.getParent(), MI); + + // Create an add instruction for the post-inc addition of offset.
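// A minimal sketch of the split, assuming hypothetical vregs v1..v3:
//   v2, v3 = L2_loadri_pi v1, #4   ; post-inc: v2 = memw(v1), v3 = v1 + 4
// becomes
//   v2 = L2_loadri_io v1, #0       ; the base+offset load built above
//   v3 = A2_addi v1, #4            ; the add built below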
+ MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi)); + MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset); + + LLVM_DEBUG({ + dbgs() << "\n"; + MI.dump(); + dbgs() << "\tis transformed to \n"; + MIB->dump(); + MIBA->dump(); + dbgs() << "\n\n"; + }); + + MI.eraseFromParent(); +} + +void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n"); + MachineBasicBlock::iterator MII = MBB.getFirstNonPHI(); + MachineBasicBlock::iterator MIE = MBB.instr_begin(); + bool isOK = true; + while (MII != MIE) { + MachineInstr *Phi = &*std::prev(MII); + MII = std::prev(MII); + unsigned LoopVal = getLoopPhiReg(Phi, &MBB); + if (LoopVal == UINT_MAX) + continue; + MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); + if (!isAddWithImmValue(*LoopInst)) + continue; + + if (LoopInst->getOpcode() != Hexagon::A2_addi) + continue; + + unsigned AddReg = LoopInst->getOperand(1).getReg(); + int64_t AddImm = LoopInst->getOperand(2).getImm(); + SmallVector<MachineInstr *, 4> UseList; + MachineInstr *PostIncCandidate = nullptr; + + // Find the probable candidates for Post-increment instruction. + SmallVector<MachineInstr *, 4> CandList; + for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + + if (UseMI == LoopInst) + continue; + + if (!dominates(UseMI, LoopInst)) { + isOK = false; + break; + } + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool OffsetIsScalable; + if (!HII->isBaseImmOffset(*UseMI) || + !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset, + OffsetIsScalable, TRI)) { + isOK = false; + break; + } + int64_t NewOffset = Offset - AddImm; + if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() || + BaseOp->getReg() != AddReg) { + isOK = false; + break; + } + if (OffsetIsScalable) { + isOK = false; + break; + } + if (Offset == 0) { + // If you have stores in the chain, make sure they are in the beginning + // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST, + // ST. + if (UseMI->mayStore() && PreferPostIncStore) + CandList.insert(CandList.begin(), UseMI); + else + CandList.push_back(UseMI); + continue; + } + UseList.push_back(UseMI); + } + + if (!isOK) + continue; + + for (auto MI : CandList) { + if (!PostIncCandidate) + PostIncCandidate = MI; + // Push the rest of the list for updating. + else + UseList.push_back(MI); + } + + // If a candidate is found, replace it with the post-inc instruction. + // Also, adjust offset for other uses as needed. + if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst)) + continue; + + // Logic to determine what the base register should be. + // There are two choices: + // 1. New address register after we updated the post-increment candidate. + // v2,v3 = post_load v1, 4 + // v3 is the choice here. + // 2. The base register we used in post-increment candidate. + // v2,v3 = post_load v1, 4 + // v1 is the choice here. + // Use v3 if there is a memory dependence between post-inc instruction and + // any other instruction in the chain. + // FIXME: We can do some complex DAG analysis based off height and depth and + // selectively update other instructions in the chain. Use v3 if there are + // more instructions in the chain, otherwise we will end up increasing the + // height of the DAG resulting in more spills. By default we have a + // threshold controlled by the option "post-inc-chain-threshold" which is + // set to 4.
v1 is preferred as we can packetize two memory operations in a + // single packet in scalar core. But it heavily depends on the structure of + // DAG. + bool UpdateBaseToNew = false; + + // Do not bother to build a DAG and analyze if the Use list is empty. + if (!UseList.empty()) { + MachineFunction *MF = MBB.getParent(); + // Setup the Post-inc schedule DAG. + HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI); + initPISchedDAG(PIDAG, MBB); + SUnit *SU = PIDAG.getSUnit(PostIncCandidate); + if (hasMemoryDependency(SU, UseList) || + UseList.size() >= PostIncChainThreshold) + UpdateBaseToNew = true; + } + + if (UpdateBaseToNew) { + LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the " + "base register of post-increment\n"); + for (auto UseMI : UseList) { + if (!dominates(PostIncCandidate, UseMI)) + continue; + unsigned BasePos, OffsetPos; + if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) { + // New offset has already been validated; no need to do it again. + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is transformed to \n"; + }); + int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm; + UseMI->getOperand(OffsetPos).setImm(NewOffset); + UseMI->getOperand(BasePos).setReg(LoopVal); + LLVM_DEBUG(UseMI->dump()); + } + } + } + replaceWithPostInc(PostIncCandidate, LoopInst); + } + LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n"); +} + +bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0) + return false; + assert(AddMI->getOpcode() == Hexagon::A2_addi); + return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm()); +} + +void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode()); + assert(NewOpcode >= 0 && + "Couldn't change base offset to post-increment form"); + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + const MachineOperand &IncDest = AddMI->getOperand(0); + const MachineOperand &IncBase = AddMI->getOperand(1); + const MachineOperand &IncValue = AddMI->getOperand(2); + MachineInstrBuilder MIB; + LLVM_DEBUG({ + dbgs() << "\n\n"; + MI->dump(); + dbgs() << "\t is transformed to post-inc form of \n"; + }); + + if (MI->mayLoad()) { + const MachineOperand &LDValue = MI->getOperand(0); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue); + } else { + const MachineOperand &STValue = MI->getOperand(2); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue); + } + + // Transfer memoperands. + MIB->cloneMemRefs(*MBB.getParent(), *MI); + + LLVM_DEBUG({ + MIB->dump(); + dbgs() << "As a result this add instruction is erased.\n"; + AddMI->dump(); + }); + + MI->eraseFromParent(); + AddMI->eraseFromParent(); +} + +bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) { + // Algorithm: + // 1. Replace all the post-inc instructions with a Base+Offset instruction + // and an add instruction in this block. + // 2. Fold all the adds into their respective uses. + // 3. Generate post-increment instructions and update the uses of the base + // register if needed based on constraints. + + replacePostIncWithBaseOffset(MBB); + foldAdds(MBB); + generatePostInc(MBB); + return true; +} + +bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) { + + // Skip pass if requested.
+ if (skipFunction(MF.getFunction())) + return false; + + // Get Target Information. + MLI = &getAnalysis<MachineLoopInfo>(); + HST = &MF.getSubtarget<HexagonSubtarget>(); + TRI = HST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + HII = HST->getInstrInfo(); + + // Skip this pass for TinyCore. + // Tiny core allows partial post increment operations - This constraint can + // be imposed inside the pass. In a chain of post-increments, the first can + // be post-increment, rest can be adjusted to base+offset (these are + // inexpensive in most of the cases). + if (HST->isTinyCore()) + return false; + + LLVM_DEBUG({ + dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n"; + dbgs() << "Function: " << MF.getName() << "\n"; + }); + bool Change = false; + std::vector<MachineBasicBlock *> MLBB; + for (auto &BB : MF) { + // Check if this Basic Block belongs to any loop. + auto *LI = MLI->getLoopFor(&BB); + // We only deal with inner-most loops that have one block. + if (LI && LI->getBlocks().size() == 1) { + MachineBasicBlock *MBB = LI->getHeader(); + // Do not traverse blocks that are already visited. + if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end()) + continue; + + MLBB.push_back(MBB); + + LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n"); + Change |= translatePostIncsInLoop(*MBB); + } + } + LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n"); + return Change; +} + +FunctionPass *llvm::createHexagonPostIncOpt() { + return new HexagonPostIncOpt(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 7d4b420..a5ebd64 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -65,6 +65,10 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true), + cl::Hidden, + cl::desc("Cleanup of TFRs/COPYs")); + static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, cl::desc("Enable early if-conversion")); @@ -92,6 +96,10 @@ static cl::opt<bool> static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, cl::desc("Disable splitting double registers")); +static cl::opt<bool> + EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden, + cl::desc("Generate absolute set instructions")); + static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true), cl::Hidden, cl::desc("Bit simplification")); @@ -121,6 +129,10 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden, cl::init(true), cl::desc("Enable instsimplify")); +static cl::opt<bool> DisableHexagonPostIncOpt( + "hexagon-postinc-opt", cl::Hidden, + cl::desc("Disable Hexagon post-increment optimization")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library.
In particular, it seems that it is not possible to get @@ -145,20 +157,24 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { extern char &HexagonExpandCondsetsID; + extern char &HexagonTfrCleanupID; void initializeHexagonBitSimplifyPass(PassRegistry&); void initializeHexagonConstExtendersPass(PassRegistry&); void initializeHexagonConstPropagationPass(PassRegistry&); void initializeHexagonCopyToCombinePass(PassRegistry&); void initializeHexagonEarlyIfConversionPass(PassRegistry&); void initializeHexagonExpandCondsetsPass(PassRegistry&); + void initializeHexagonGenMemAbsolutePass(PassRegistry &); void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonHardwareLoopsPass(PassRegistry&); void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &); void initializeHexagonNewValueJumpPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); void initializeHexagonPacketizerPass(PassRegistry&); + void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); + void initializeHexagonTfrCleanupPass(PassRegistry &); void initializeHexagonVExtractPass(PassRegistry &); void initializeHexagonVectorCombineLegacyPass(PassRegistry&); void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &); @@ -177,6 +193,7 @@ namespace llvm { FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMemAbsolute(); FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); @@ -188,10 +205,12 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonPostIncOpt(); FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); + FunctionPass *createHexagonTfrCleanup(); FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); @@ -211,12 +230,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { initializeHexagonConstPropagationPass(PR); initializeHexagonCopyToCombinePass(PR); initializeHexagonEarlyIfConversionPass(PR); + initializeHexagonGenMemAbsolutePass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonHardwareLoopsPass(PR); initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR); initializeHexagonNewValueJumpPass(PR); initializeHexagonOptAddrModePass(PR); initializeHexagonPacketizerPass(PR); + initializeHexagonPostIncOptPass(PR); initializeHexagonRDFOptPass(PR); initializeHexagonSplitDoubleRegsPass(PR); initializeHexagonVectorCombineLegacyPass(PR); @@ -244,6 +265,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, (HexagonNoOpt ? 
CodeGenOptLevel::None : OL)), TLOF(std::make_unique<HexagonTargetObjectFile>()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -411,11 +434,20 @@ void HexagonPassConfig::addPreRegAlloc() { addPass(createHexagonConstExtenders()); if (EnableExpandCondsets) insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID); + if (EnableTfrCleanup) + insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID); if (!DisableStoreWidening) addPass(createHexagonStoreWidening()); + if (EnableGenMemAbs) + addPass(createHexagonGenMemAbsolute()); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops()); } + + if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive) + if (!DisableHexagonPostIncOpt) + addPass(createHexagonPostIncOpt()); + if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(&MachinePipelinerID); } diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp new file mode 100644 index 0000000..a4b359a --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -0,0 +1,324 @@ +//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass is to address a situation that appears after register allocation +// every now and then, namely a register copy from a source that was defined +// as an immediate value in the same block (usually just before the copy).
+// +// Here is an example of actual code emitted that shows this problem: +// +// .LBB0_5: +// { +// r5 = zxtb(r8) +// r6 = or(r6, ##12345) +// } +// { +// r3 = xor(r1, r2) +// r1 = #0 <-- r1 set to #0 +// } +// { +// r7 = r1 <-- r7 set to r1 +// r0 = zxtb(r3) +// } + +#define DEBUG_TYPE "tfr-cleanup" +#include "HexagonTargetMachine.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createHexagonTfrCleanup(); +void initializeHexagonTfrCleanupPass(PassRegistry &); +} // namespace llvm + +namespace { +class HexagonTfrCleanup : public MachineFunctionPass { +public: + static char ID; + HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) { + PassRegistry &R = *PassRegistry::getPassRegistry(); + initializeHexagonTfrCleanupPass(R); + } + StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + const HexagonInstrInfo *HII; + const TargetRegisterInfo *TRI; + + typedef DenseMap<unsigned, uint64_t> ImmediateMap; + + bool isIntReg(unsigned Reg, bool &Is32); + void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap); + bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap); + bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap); + bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes); + bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes); +}; +} // namespace + +char HexagonTfrCleanup::ID = 0; + +namespace llvm { +char &HexagonTfrCleanupID = HexagonTfrCleanup::ID; +} + +bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) { + Is32 = Hexagon::IntRegsRegClass.contains(Reg); + return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg); +} + +// Assign given value V32 to the specified register R32 in the map. Only +// 32-bit registers are valid arguments. +void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) { + ImmediateMap::iterator F = IMap.find(R32); + if (F == IMap.end()) + IMap.insert(std::make_pair(R32, V32)); + else + F->second = V32; +} + +// Retrieve a value of the provided register Reg and store it into Val. +// Return "true" if a value was found, "false" otherwise. +bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val, + ImmediateMap &IMap) { + bool Is32; + if (!isIntReg(Reg, Is32)) + return false; + + if (Is32) { + ImmediateMap::iterator F = IMap.find(Reg); + if (F == IMap.end()) + return false; + Val = F->second; + return true; + } + + // For 64-bit registers, compose the value from the values of its + // subregisters.
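// (A worked example, assuming hypothetical map contents: for d0 = r1:0
// with IMap[r1] = 0x1 and IMap[r0] = 0x2, the value composed below is
// 0x0000000100000002.)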
+ unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo); + unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi); + ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH); + if (FL == IMap.end() || FH == IMap.end()) + return false; + Val = (FH->second << 32) | FL->second; + return true; +} + +// Process an instruction and record the relevant information in the +// immediate map. +bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) { + using namespace Hexagon; + + if (MI->isCall()) { + IMap.clear(); + return true; + } + + // If this is an instruction that loads a constant into a register, + // record this information in IMap. + unsigned Opc = MI->getOpcode(); + if (Opc == A2_tfrsi || Opc == A2_tfrpi) { + unsigned DefR = MI->getOperand(0).getReg(); + bool Is32; + if (!isIntReg(DefR, Is32)) + return false; + if (!MI->getOperand(1).isImm()) { + if (!Is32) { + IMap.erase(TRI->getSubReg(DefR, isub_lo)); + IMap.erase(TRI->getSubReg(DefR, isub_hi)); + } else { + IMap.erase(DefR); + } + return false; + } + uint64_t Val = MI->getOperand(1).getImm(); + // If it's a 64-bit register, break it up into subregisters. + if (!Is32) { + uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU); + setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap); + setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap); + } else { + setReg(DefR, Val, IMap); + } + return true; + } + + // Not an A2_tfr[sp]i. Invalidate all modified registers in IMap. + for (MachineInstr::mop_iterator Mo = MI->operands_begin(), + E = MI->operands_end(); + Mo != E; ++Mo) { + if (Mo->isRegMask()) { + IMap.clear(); + return true; + } + if (!Mo->isReg() || !Mo->isDef()) + continue; + unsigned R = Mo->getReg(); + for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) { + ImmediateMap::iterator F = IMap.find(*AR); + if (F != IMap.end()) + IMap.erase(F); + } + } + return true; +} + +// Rewrite the instruction as A2_tfrsi/A2_tfrpi if it is a copy of a source +// that has a known constant value. +bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, + SlotIndexes *Indexes) { + using namespace Hexagon; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case A2_tfr: + case A2_tfrp: + case COPY: + break; + default: + return false; + } + + unsigned DstR = MI->getOperand(0).getReg(); + unsigned SrcR = MI->getOperand(1).getReg(); + bool Tmp, Is32; + if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp)) + return false; + assert(Tmp == Is32 && "Register size mismatch"); + uint64_t Val; + bool Found = getReg(SrcR, Val, IMap); + if (!Found) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + int64_t SVal = Is32 ? int32_t(Val) : Val; + auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>(); + MachineInstr *NewMI; + if (Is32) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal); + else if (isInt<8>(SVal)) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal); + else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL))) + NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR) + .addImm(int32_t(SVal >> 32)) + .addImm(int32_t(Val & 0xFFFFFFFFLL)); + else if (HST.isTinyCore()) + // Disable generating CONST64 since it requires load resource.
+ return false; + else + NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val); + + // Replace the MI to reuse the same slot index + if (Indexes) + Indexes->replaceMachineInstrInMaps(*MI, *NewMI); + MI->eraseFromParent(); + return true; +} + +// Remove the instruction if it is a self-assignment. +bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI, + SlotIndexes *Indexes) { + unsigned Opc = MI->getOpcode(); + unsigned DefR, SrcR; + bool IsUndef = false; + switch (Opc) { + case Hexagon::A2_tfr: + // Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(1).getReg(); + IsUndef = MI->getOperand(1).isUndef(); + break; + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + // if ([!]Pu) Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(2).getReg(); + IsUndef = MI->getOperand(2).isUndef(); + break; + default: + return false; + } + if (DefR != SrcR) + return false; + if (IsUndef) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR); + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef() && Op.isImplicit()) + DefI->addOperand(Op); + } + + if (Indexes) + Indexes->removeMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return true; +} + +bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + // Map: 32-bit register -> immediate value. + // 64-bit registers are stored through their subregisters. + ImmediateMap IMap; + SlotIndexes *Indexes = this->getAnalysisIfAvailable<SlotIndexes>(); + + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + HII = HST.getInstrInfo(); + TRI = HST.getRegisterInfo(); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock &B = *I; + MachineBasicBlock::iterator J, F, NextJ; + IMap.clear(); + bool Inserted = false, Erased = false; + for (J = B.begin(), F = B.end(); J != F; J = NextJ) { + NextJ = std::next(J); + MachineInstr *MI = &*J; + bool E = eraseIfRedundant(MI, Indexes); + Erased |= E; + if (E) + continue; + Inserted |= rewriteIfImm(MI, IMap, Indexes); + MachineBasicBlock::iterator NewJ = std::prev(NextJ); + updateImmMap(&*NewJ, IMap); + } + bool BlockC = Inserted | Erased; + Changed |= BlockC; + if (BlockC && Indexes) + Indexes->repairIndexesInRange(&B, B.begin(), B.end()); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false, + false) + +FunctionPass *llvm::createHexagonTfrCleanup() { + return new HexagonTfrCleanup(); +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ca98269..9840412 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -18,6 +18,7 @@ #include "HexagonDepITypes.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/MC/MCInstrDesc.h" namespace llvm { @@ -48,7 +49,7 @@ namespace HexagonII { // MCInstrDesc TSFlags // *** Must match HexagonInstrFormat*.td *** - enum { + enum HexagonTSFlagsVal { // This 7-bit field describes the insn type. 
TypePos = 0, TypeMask = 0x7f, @@ -173,6 +174,11 @@ namespace HexagonII { hasUnaryRestrictionMask = 0x1, }; + inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos, + unsigned Mask) { + return (MID.TSFlags >> Pos) & Mask; + } + // *** The code above must match HexagonInstrFormat*.td *** // // Hexagon specific MO operand flag mask. @@ -275,6 +281,10 @@ namespace HexagonII { INST_ICLASS_ALU32_3 = 0xf0000000 }; + inline bool isCVI(const MCInstrDesc &MID) { + return getTSFlags(MID, isCVIPos, isCVIMask) != 0; + } + LLVM_ATTRIBUTE_UNUSED static unsigned getMemAccessSizeInBytes(MemAccessSize S) { switch (S) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ded2f25..3ff8994 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); + if (Opcode == NVPTX::StoreRetvalI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, @@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF32, NVPTX::StoreParamF64); + if (Opcode == NVPTX::StoreParamI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d2fe78..66a1010 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -59,6 +60,7 @@ #include <cmath> #include <cstdint> #include <iterator> +#include <optional> #include <sstream> #include <string> #include <utility> @@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } +static bool adjustElementType(EVT &ElementType) { + switch (ElementType.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::f16: + case MVT::bf16: + ElementType = MVT::i16; + return true; + case MVT::f32: + case MVT::v2f16: + case MVT::v2bf16: + ElementType = MVT::i32; + return true; + case MVT::f64: + ElementType = MVT::i64; + return true; + } +} + +// Use byte-store when the param address of the argument value is unaligned. 
+// This may happen when the return value is a field of a packed structure. +// +// This is called in LowerCall() when passing the param values. +static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue StVal, SDValue &InGlue, + unsigned ArgID, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); + + // Store each byte + SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal, InGlue}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); + InGlue = Chain.getValue(1); + } + return Chain; +} + +// Use byte-load when the param address of the returned value is unaligned. +// This may happen when the returned value is a field of a packed structure. +static SDValue +LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, + EVT ElementType, SDValue &InGlue, + SmallVectorImpl<SDValue> &TempProxyRegOps, + const SDLoc &dl) { + // Bit logic only works on integer types + EVT MergedType = ElementType; + adjustElementType(MergedType); + + // Load each byte and construct the whole value. Initialize to 0. + SDValue RetVal = DAG.getConstant(0, dl, MergedType); + // LoadParamMemI8 loads into an i16 register only. + SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + InGlue}; + // This will be selected to LoadParamMemI8 + SDValue LdVal = + DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, + MVT::i8, MachinePointerInfo(), Align(1)); + SDValue TmpLdVal = LdVal.getValue(0); + Chain = LdVal.getValue(1); + InGlue = LdVal.getValue(2); + + TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, + TmpLdVal.getSimpleValueType(), TmpLdVal); + TempProxyRegOps.push_back(TmpLdVal); + + SDValue CMask = DAG.getConstant(255, dl, MergedType); + SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); + // Need to extend the i16 register to the whole width. + TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); + // Mask off the high bits. Leave only the lower 8 bits. + // Do this because we are using loadparam.b8.
+ TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); + // Shift and merge + TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); + RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); + } + if (ElementType != MergedType) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + return RetVal; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedAlign) PartAlign = commonAlignment(ArgAlign, CurOffset); - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back( - DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant( - IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), - dl, MVT::i32)); - } - SDValue StVal = OutVals[OIdx]; MVT PromotedVT; @@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar store. In such cases, fall back to byte stores. + if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && + PartAlign.value() < + DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + Chain = LowerUnalignedStoreParam( + DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, + StVal, InGlue, ParamCount, dl); + + // LowerUnalignedStoreParam took care of inserting the necessary nodes + // into the SDAG, so just move on to the next element. + if (!IsByVal) + ++OIdx; + continue; + } + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back( + DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); + + StoreOperands.push_back(DAG.getConstant( + IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), + dl, MVT::i32)); + } + // Record the value to store. StoreOperands.push_back(StVal); @@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 16> ProxyRegOps; SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; + // An item of the vector is filled if the element does not need a ProxyReg + // operation on it and should be added to InVals as is. ProxyRegOps and + // ProxyRegTruncates contain empty/none items at the same index. + SmallVector<SDValue, 16> RetElts; + // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` + // to use the values of `LoadParam`s, to be replaced later when + // `CALLSEQ_END` is added. + SmallVector<SDValue, 16> TempProxyRegOps; // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { @@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EltType = MVT::i16; } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar load. In such cases, fall back to byte loads.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && + EltAlign < DL.getABITypeAlign( + TheLoadType.getTypeForEVT(*DAG.getContext()))) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + SDValue Ret = LowerUnalignedLoadRetParam( + DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); + ProxyRegOps.push_back(SDValue()); + ProxyRegTruncates.push_back(std::optional<MVT>()); + RetElts.resize(i); + RetElts.push_back(Ret); + + continue; + } + // Record index of the very first element of the vector. if (VectorInfo[i] & PVF_FIRST) { assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); @@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { + if (i < RetElts.size() && RetElts[i]) { + InVals.push_back(RetElts[i]); + continue; + } + SDValue Ret = DAG.getNode( NVPTXISD::ProxyReg, dl, DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), @@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret); } + for (SDValue &T : TempProxyRegOps) { + SDValue Repl = DAG.getNode( + NVPTXISD::ProxyReg, dl, + DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), + {Chain, T.getOperand(0), InGlue}); + DAG.ReplaceAllUsesWith(T, Repl); + DAG.RemoveDeadNode(T.getNode()); + + Chain = Repl.getValue(1); + InGlue = Repl.getValue(2); + } + // set isTailCall to false for now, until we figure out how to express // tail call optimization in PTX isTailCall = false; @@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + + const MaybeAlign PartAlign = [&]() -> MaybeAlign { + if (aggregateIsPacked) + return Align(1); + if (NumElts != 1) + return std::nullopt; + Align PartAlign = + (Offsets[parti] == 0 && PAL.getParamAlignment(i)) + ? PAL.getParamAlignment(i).value() + : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + return commonAlignment(PartAlign, Offsets[parti]); + }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), - MaybeAlign(aggregateIsPacked ? 1 : 0), + MachinePointerInfo(srcValue), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) @@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } +// Use byte-store when the param address of the return value is unaligned. +// This may happen when the return value is a field of a packed structure. +static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue RetVal, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + // Store each byte + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), StoreOperands, + MVT::i8, MachinePointerInfo(), std::nullopt, + MachineMemOperand::MOStore); + } + return Chain; +} + SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 6> StoreOperands; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - SDValue OutVal = OutVals[i]; SDValue RetVal = PromotedOutVals[i]; @@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } + // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned + // for a scalar store. In such cases, fall back to byte stores. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { + EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Align ElementTypeAlign = + DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); + Align ElementAlign = + commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); + if (ElementAlign < ElementTypeAlign) { + assert(StoreOperands.empty() && "Orphaned operand list."); + Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, + RetVal, dl); + + // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes + // into the graph, so just move on to the next element. + continue; + } + } + + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + // Record the value to return. 
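The three helpers above -- LowerUnalignedStoreParam, LowerUnalignedLoadRetParam, and LowerUnalignedStoreRet -- all apply the same trick: split an insufficiently aligned scalar access into b8 accesses and rebuild the value with shifts and ORs. As a reading aid, here is a minimal standalone C++ sketch of that byte arithmetic, with illustrative names and none of the NVPTX SDAG machinery:

#include <cassert>
#include <cstdint>

// Store a 32-bit value through a possibly unaligned pointer, one byte per
// "st.param.b8": shift the byte of interest down, then truncating-store it.
void storeUnaligned32(uint8_t *Addr, uint32_t Val) {
  for (unsigned I = 0; I < 4; ++I)
    Addr[I] = uint8_t(Val >> (I * 8)); // SRL by I*8, then byte store
}

// Load it back: each byte is zero-extended, masked to its low 8 bits, shifted
// into position, and OR'd into an accumulator that starts at 0 -- the same
// zext/and/shl/or chain LowerUnalignedLoadRetParam builds in the SDAG.
uint32_t loadUnaligned32(const uint8_t *Addr) {
  uint32_t Ret = 0;
  for (unsigned I = 0; I < 4; ++I)
    Ret |= (uint32_t(Addr[I]) & 0xff) << (I * 8);
  return Ret;
}

int main() {
  uint8_t Buf[5];
  storeUnaligned32(Buf + 1, 0xdeadbeef); // deliberately misaligned slot
  assert(loadUnaligned32(Buf + 1) == 0xdeadbeef);
}

Floating-point element types are first bitcast to the same-width integer type (adjustElementType) precisely so this shift/mask logic applies.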
StoreOperands.push_back(RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 55a1955..b3517ce 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; +def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">; +def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">; def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; @@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; +def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">; +def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">; def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 904f1d7..c922098 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = SubVecVT; // Establish the correct scalable-vector types for any fixed-length type. - if (SubVecVT.isFixedLengthVector()) + if (SubVecVT.isFixedLengthVector()) { + assert(Idx == 0 && V.isUndef()); SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); + } if (VT.isFixedLengthVector()) VT = TLI.getContainerForFixedLengthVector(VT); @@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = VT; // Establish the correct scalable-vector types for any fixed-length type. 
- if (VT.isFixedLengthVector()) + if (VT.isFixedLengthVector()) { + assert(Idx == 0); SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT); + } if (InVT.isFixedLengthVector()) InVT = TLI.getContainerForFixedLengthVector(InVT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7275eb..540c2e7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, - ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; + ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, + ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, + ISD::VP_USUBSAT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -830,7 +832,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT, Custom); - setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); @@ -956,6 +957,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // between vXf16 and vXf64 must be lowered as sequences which convert via // vXf32. setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); @@ -3240,45 +3242,49 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF, // Note that this method will also match potentially unappealing index // sequences, like <i32 0, i32 50939494>, however it is left to the caller to // determine whether this is worth generating code for. -static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { - unsigned NumElts = Op.getNumOperands(); +static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op, + unsigned EltSizeInBits) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); + if (!cast<BuildVectorSDNode>(Op)->isConstant()) + return std::nullopt; bool IsInteger = Op.getValueType().isInteger(); std::optional<unsigned> SeqStepDenom; std::optional<int64_t> SeqStepNum, SeqAddend; std::optional<std::pair<uint64_t, unsigned>> PrevElt; - unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits(); - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - // Assume undef elements match the sequence; we just have to be careful - // when interpolating across them. - if (Op.getOperand(Idx).isUndef()) + assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); + + // First extract the ops into a list of constant integer values. This may not + // be possible for floats if they're not all representable as integers. + SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands()); + const unsigned OpSize = Op.getScalarValueSizeInBits(); + for (auto [Idx, Elt] : enumerate(Op->op_values())) { + if (Elt.isUndef()) { + Elts[Idx] = std::nullopt; continue; - - uint64_t Val; + } if (IsInteger) { - // The BUILD_VECTOR must be all constants. 
- if (!isa<ConstantSDNode>(Op.getOperand(Idx))) - return std::nullopt; - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes<uint64_t>(EltSizeInBits); + Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize); } else { - // The BUILD_VECTOR must be all constants. - if (!isa<ConstantFPSDNode>(Op.getOperand(Idx))) - return std::nullopt; - if (auto ExactInteger = getExactInteger( - cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits)) - Val = *ExactInteger; - else + auto ExactInteger = + getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize); + if (!ExactInteger) return std::nullopt; + Elts[Idx] = *ExactInteger; } + } + + for (auto [Idx, Elt] : enumerate(Elts)) { + // Assume undef elements match the sequence; we just have to be careful + // when interpolating across them. + if (!Elt) + continue; if (PrevElt) { // Calculate the step since the last non-undef element, and ensure // it's consistent across the entire sequence. unsigned IdxDiff = Idx - PrevElt->second; - int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits); + int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits); // A zero-value value difference means that we're somewhere in the middle // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a @@ -3308,8 +3314,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { } // Record this non-undef element for later. - if (!PrevElt || PrevElt->first != Val) - PrevElt = std::make_pair(Val, Idx); + if (!PrevElt || PrevElt->first != *Elt) + PrevElt = std::make_pair(*Elt, Idx); } // We need to have logged a step for this to count as a legal index sequence. @@ -3318,21 +3324,12 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { // Loop back through the sequence and validate elements we might have skipped // while waiting for a valid step. While doing this, log any sequence addend. - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - if (Op.getOperand(Idx).isUndef()) + for (auto [Idx, Elt] : enumerate(Elts)) { + if (!Elt) continue; - uint64_t Val; - if (IsInteger) { - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes<uint64_t>(EltSizeInBits); - } else { - Val = *getExactInteger( - cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits); - } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; - int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits); + int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits); if (!SeqAddend) SeqAddend = Addend; else if (Addend != SeqAddend) @@ -3598,7 +3595,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. - if (auto SimpleVID = isSimpleVIDSequence(Op)) { + if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) { int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; @@ -3853,11 +3850,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we're compiling for an exact VLEN value, we can split our work per // register in the register group. 
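Returning to the isSimpleVIDSequence rework above: the helper recognizes constant build_vectors of the form elt[i] = (i * StepNumerator) / StepDenominator + Addend, which can then be lowered as vid.v plus a scalar multiply/shift/add. A rough standalone C++ sketch of the recognition arithmetic for the simple unit-denominator, no-undef case (names are illustrative; the real code also handles undef gaps, fractional steps, and element-width truncation):

#include <cstdint>
#include <optional>
#include <vector>

struct VIDSeq {
  int64_t Step;   // StepNumerator with StepDenominator == 1
  int64_t Addend;
};

// Match Elts[i] == i * Step + Addend for every element.
std::optional<VIDSeq> matchSimpleVID(const std::vector<int64_t> &Elts) {
  if (Elts.size() < 2)
    return std::nullopt; // need two elements to log a step
  const int64_t Step = Elts[1] - Elts[0];
  const int64_t Addend = Elts[0];
  for (size_t I = 0; I < Elts.size(); ++I)
    if (Elts[I] != int64_t(I) * Step + Addend)
      return std::nullopt;
  return VIDSeq{Step, Addend};
}
// e.g. {3, 5, 7, 9} matches with Step = 2, Addend = 3.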
- const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) { + if (const auto VLen = Subtarget.getRealVLen(); + VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) { MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); @@ -4768,9 +4764,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, // If we don't know exact data layout, not much we can do. If this // is already m1 or smaller, no point in splitting further. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen) + const auto VLen = Subtarget.getRealVLen(); + if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen) return SDValue(); // Avoid picking up bitrotate patterns which we have a linear-in-lmul @@ -4781,7 +4776,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, return SDValue(); MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned VRegsPerSrc = NumElts / ElemsPerVReg; SmallVector<std::pair<int, SmallVector<int>>> @@ -5759,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP VP_CASE(BITREVERSE) // VP_BITREVERSE + VP_CASE(SADDSAT) // VP_SADDSAT + VP_CASE(UADDSAT) // VP_UADDSAT + VP_CASE(SSUBSAT) // VP_SSUBSAT + VP_CASE(USUBSAT) // VP_USUBSAT VP_CASE(BSWAP) // VP_BSWAP VP_CASE(CTLZ) // VP_CTLZ VP_CASE(CTTZ) // VP_CTTZ @@ -6798,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UDIV: case ISD::VP_SREM: case ISD::VP_UREM: + case ISD::VP_UADDSAT: + case ISD::VP_USUBSAT: + case ISD::VP_SADDSAT: + case ISD::VP_SSUBSAT: return lowerVPOp(Op, DAG); case ISD::VP_AND: case ISD::VP_OR: @@ -7384,6 +7387,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) return V; + // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) + // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) + if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) { + const APInt &TrueVal = TrueV->getAsAPIntVal(); + const APInt &FalseVal = FalseV->getAsAPIntVal(); + const int TrueValCost = RISCVMatInt::getIntMatCost( + TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + const int FalseValCost = RISCVMatInt::getIntMatCost( + FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + bool IsCZERO_NEZ = TrueValCost <= FalseValCost; + SDValue LHSVal = DAG.getConstant( + IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT); + SDValue RHSVal = + DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT); + SDValue CMOV = + DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ, + DL, VT, LHSVal, CondV); + return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal); + } + // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) // Unless we have the short forward branch optimization. 
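The new constant-select lowering just above rests on a small arithmetic identity around the conditional-zero operations: czero.nez(x, c) yields 0 when c is nonzero and x otherwise, so (select c, c1, c2) equals czero.nez(c2 - c1, c) + c1 under wrapping arithmetic; the getIntMatCost comparison only decides which constant plays which role. A standalone sketch of that identity (czeroNez is an illustrative model of the instruction, not LLVM API):

#include <cassert>
#include <cstdint>

// Models czero.nez rd, rs1, rs2: rd = (rs2 != 0) ? 0 : rs1.
uint64_t czeroNez(uint64_t Val, uint64_t Cond) { return Cond != 0 ? 0 : Val; }

// (select c, c1, c2) == czero.nez(c2 - c1, c) + c1 with wrap-around math:
// c != 0 zeroes the difference and leaves c1; c == 0 yields (c2 - c1) + c1.
uint64_t selectViaCzero(uint64_t C, uint64_t C1, uint64_t C2) {
  return czeroNez(C2 - C1, C) + C1;
}

int main() {
  assert(selectViaCzero(/*C=*/1, 10, 99) == 10);
  assert(selectViaCzero(/*C=*/0, 10, 99) == 99);
}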
if (!Subtarget.hasConditionalMoveFusion()) @@ -8313,15 +8336,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // constant index, we can always perform the extract in m1 (or // smaller) as we can determine the register corresponding to // the index in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx); - IdxC && MinVLen == MaxVLen && - VecVT.getSizeInBits().getKnownMinValue() > MinVLen) { + IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) { MVT M1VT = getLMUL1VT(ContainerVT); unsigned OrigIdx = IdxC->getZExtValue(); EVT ElemVT = VecVT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; unsigned SubRegIdx = OrigIdx / ElemsPerVReg; unsigned ExtractIdx = @@ -9782,15 +9803,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, if (OrigIdx == 0) return Op; - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); // If the subvector vector is a fixed-length type and we don't know VLEN // exactly, we cannot use subregister manipulation to simplify the codegen; we // don't know which register of a LMUL group contains the specific subvector // as we only know the minimum register size. Therefore we must slide the // vector group down the full amount. - if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) { + if (SubVecVT.isFixedLengthVector() && !VLen) { MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -9837,8 +9857,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if // we have a fixed length subvector, we need to adjust the index by 1/vscale. if (SubVecVT.isFixedLengthVector()) { - assert(MinVLen == MaxVLen); - unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock; + assert(VLen); + unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; auto Decompose = RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI); @@ -12872,6 +12892,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineSubOfBoolean(N, DAG)) return V; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1) @@ -12879,7 +12900,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, isNullConstant(N1.getOperand(1))) { ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get(); if (CCVal == ISD::SETLT) { - EVT VT = N->getValueType(0); SDLoc DL(N); unsigned ShAmt = N0.getValueSizeInBits() - 1; return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), @@ -12887,6 +12907,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, } } + // sub (zext, zext) -> sext (sub (zext, zext)) + // where the sum of the extend widths match, and the inner zexts + // add at least one bit. (For profitability on rvv, we use a + // power of two for both inner and outer extend.) 
+ if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) && + N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND && + N0.hasOneUse() && N1.hasOneUse()) { + SDValue Src0 = N0.getOperand(0); + SDValue Src1 = N1.getOperand(0); + EVT SrcVT = Src0.getValueType(); + if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) && + SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 && + SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) { + LLVMContext &C = *DAG.getContext(); + EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C); + EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount()); + Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0); + Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, + DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1)); + } + } + // fold (sub x, (select lhs, rhs, cc, 0, y)) -> // (select lhs, rhs, cc, x, (sub x, y)) return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget); @@ -15978,7 +16021,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (Index.getOpcode() == ISD::BUILD_VECTOR && MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { - if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index); + // The sequence will be XLenVT, not the type of Index. Tell + // isSimpleVIDSequence this so we avoid overflow. + if (std::optional<VIDSequence> SimpleVID = + isSimpleVIDSequence(Index, Subtarget.getXLen()); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; const int64_t Addend = SimpleVID->Addend; diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index ff21fe1..af864ba 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) { // Return true if MI is a load for which there exists a compressed version. static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == RISCV::FLD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::LW: + case RISCV::LD: + return STI.hasStdExtCOrZca(); + case RISCV::FLW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FLD: + return STI.hasStdExtCOrZcd(); + } } // Return true if MI is a store for which there exists a compressed version. 
static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::SW: + case RISCV::SD: + return STI.hasStdExtCOrZca(); + case RISCV::FSW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FSD: + return STI.hasStdExtCOrZcd(); + } } // Find a single register and/or large offset which, if compressible, would @@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { const RISCVInstrInfo &TII = *STI.getInstrInfo(); // This optimization only makes sense if compressed instructions are emitted. - // FIXME: Support Zca, Zcf, Zcd granularity. - if (!STI.hasStdExtC()) + if (!STI.hasStdExtCOrZca()) return false; for (MachineBasicBlock &MBB : Fn) { diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index d15cb61..0be681d 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, let ReleaseAtCycles = noPredReleaseCycles; } + // Define SchedVars + def nameMX # PredSchedVar + : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>; + def nameMX # NoPredSchedVar + : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>; + // Allow multiclass to refer to SchedVars -- need to have NAME prefix. + defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar); + defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar); + // Tie behavior to predicate - def NAME # nameMX # "_Variant" : SchedWriteVariant<[ - SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>, - SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # nameMX # "_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast<SchedReadWrite>(nameMX), !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>; if IsWorstCase then { - def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[ - SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>, - SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # name # "_WorstCase_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast<SchedReadWrite>(name # "_WorstCase"), !cast<SchedReadWrite>(NAME # name # "_WorstCase_Variant")>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b60d7a..9ebf278 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -143,6 +143,10 @@ public: #include "RISCVGenSubtargetInfo.inc" bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; } + bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; } + bool hasStdExtCOrZcfOrZce() const { + return HasStdExtC || HasStdExtZcf || HasStdExtZce; + } bool hasStdExtZvl() const { return ZvlLen != 0; } bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; } bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index adef40e..3e20e45 100644 --- 
a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -84,7 +84,7 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination( static cl::opt<bool> EnableSinkFold("riscv-enable-sink-fold", cl::desc("Enable sinking and folding of instruction copies"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden, diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index e6e3560..28a63b9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -619,7 +619,8 @@ class GroupBuiltin<string name, Op operation> { !eq(operation, OpGroupNonUniformShuffleDown), !eq(operation, OpGroupBroadcast), !eq(operation, OpGroupNonUniformBroadcast), - !eq(operation, OpGroupNonUniformBroadcastFirst)); + !eq(operation, OpGroupNonUniformBroadcastFirst), + !eq(operation, OpGroupNonUniformRotateKHR)); bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical); } @@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>; defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>; +// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate +defm : DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>; +defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>; + // cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>; defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index cc438b2..10569ef 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) { static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, SPIRVGlobalRegistry *GR, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + const SPIRVSubtarget &ST) { // Read argument's access qualifier from metadata or default. SPIRV::AccessQualifier::AccessQualifier ArgAccessQual = getArgAccessQual(F, ArgIdx); @@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, if (MDTypeStr.ends_with("*")) ResArgType = GR->getOrCreateSPIRVTypeByName( MDTypeStr, MIRBuilder, - addressSpaceToStorageClass( - OriginalArgType->getPointerAddressSpace())); + addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(), + ST)); else if (MDTypeStr.ends_with("_t")) ResArgType = GR->getOrCreateSPIRVTypeByName( "opencl." + MDTypeStr.str(), MIRBuilder, @@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, assert(GR && "Must initialize the SPIRV type registry before lowering args."); GR->setCurrentFunc(MIRBuilder.getMF()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); + // Assign types and names to all args, and store their types for later. 
FunctionType *FTy = getOriginalFunctionType(F); SmallVector<SPIRVType *, 4> ArgTypeVRegs; @@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. if (VRegs[i].size() > 1) return false; - auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder); + auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST); GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF()); ArgTypeVRegs.push_back(SpirvTy); @@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (F.hasName()) buildOpName(FuncVReg, F.getName(), MIRBuilder); - // Get access to information about available extensions - const auto *ST = - static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); - // Handle entry points and function linkage. if (isEntryPoint(F)) { const auto &STI = MIRBuilder.getMF().getSubtarget<SPIRVSubtarget>(); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 47fec74..a1cb630 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( // TODO: change the implementation once opaque pointers are supported // in the SPIR-V specification. SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder); - auto SC = addressSpaceToStorageClass(PType->getAddressSpace()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); + auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST); // Null pointer means we have a loop in type definitions, make and // return corresponding OpTypeForwardPointer. if (SpvElementType == nullptr) { diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index f317b26..d34f802 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -31,6 +31,9 @@ public: return true; } + // prevent creation of jump tables + bool areJTsAllowed(const Function *) const override { return false; } + // This is to prevent sexts of non-i64 vector indices which are generated // within general IRTranslator hence type generation for it is omitted. 
MVT getVectorIdxTy(const DataLayout &DL) const override { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 0f11bc3..7c5252e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor "$r = OpGenericCastToPtrExplicit $t $p $s">; def OpBitcast : UnOp<"OpBitcast", 124>; +// SPV_INTEL_usm_storage_classes +def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>; +def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>; + // 3.42.12 Composite Instructions def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), @@ -765,6 +769,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>; def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>; def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>; +// SPV_KHR_subgroup_rotate +def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res), + (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops), + "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">; + // 3.49.7, Constant-Creation Instructions // - SPV_INTEL_function_pointers diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 53d19a1..7258d3b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) { } } +static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) { + switch (SC) { + case SPIRV::StorageClass::DeviceOnlyINTEL: + case SPIRV::StorageClass::HostOnlyINTEL: + return true; + default: + return false; + } +} + // In SPIR-V address space casting can only happen to and from the Generic -// storage class. We can also only case Workgroup, CrossWorkgroup, or Function +// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function // pointers to and from Generic pointers. As such, we can convert e.g. from // Workgroup to Function by going via a Generic pointer as an intermediary. All // other combinations can only be done by a bitcast, and are probably not safe. @@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr); SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg); - // Casting from an eligable pointer to Generic. + // don't generate a cast between identical storage classes + if (SrcSC == DstSC) + return true; + + // Casting from an eligible pointer to Generic. if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric); - // Casting from Generic to an eligable pointer. + // Casting from Generic to an eligible pointer. if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr); - // Casting between 2 eligable pointers using Generic as an intermediary. + // Casting between 2 eligible pointers using Generic as an intermediary. 
if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) { Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass); SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType( @@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, .addUse(Tmp) .constrainAllUses(TII, TRI, RBI); } + + // Check if instructions from the SPV_INTEL_usm_storage_classes extension may + // be applied + if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpPtrCastToCrossWorkgroupINTEL); + if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC)) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpCrossWorkgroupCastToPtrINTEL); + // TODO Should this case just be disallowed completely? // We're casting 2 other arbitrary address spaces, so have to bitcast. return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); @@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( } SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( PointerBaseType, I, TII, - addressSpaceToStorageClass(GV->getAddressSpace())); + addressSpaceToStorageClass(GV->getAddressSpace(), STI)); std::string GlobalIdent; if (!GV->hasName()) { @@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( unsigned AddrSpace = GV->getAddressSpace(); SPIRV::StorageClass::StorageClass Storage = - addressSpaceToStorageClass(AddrSpace); + addressSpaceToStorageClass(AddrSpace, STI); bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage && Storage != SPIRV::StorageClass::Function; SPIRV::LinkageType::LinkageType LnkType = diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 011a550..4f2e7a2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p2 = LLT::pointer(2, PSize); // UniformConstant const LLT p3 = LLT::pointer(3, PSize); // Workgroup const LLT p4 = LLT::pointer(4, PSize); // Generic - const LLT p5 = LLT::pointer(5, PSize); // Input + const LLT p5 = + LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device) + const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host) // TODO: remove copy-pasting here by using concatenation in some way. 
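For orientation, the SPV_INTEL_usm_storage_classes plumbing in these SPIR-V hunks reduces to an address-space-to-storage-class mapping that is gated on whether the extension was requested. A hedged standalone sketch of that shape (the enum is an illustrative stand-in; the authoritative mapping is the addressSpaceToStorageClass hunk in SPIRVUtils.cpp further below):

#include <cstdlib>

enum class StorageClass {
  Function, CrossWorkgroup, UniformConstant, Workgroup, Generic,
  DeviceOnlyINTEL, HostOnlyINTEL, Input
};

// Address spaces 5 and 6 map to the INTEL USM storage classes only when
// SPV_INTEL_usm_storage_classes is enabled; otherwise they degrade to plain
// CrossWorkgroup so the emitted module stays valid without the extension.
StorageClass mapAddrSpace(unsigned AS, bool HasUSMStorageClasses) {
  switch (AS) {
  case 0: return StorageClass::Function;
  case 1: return StorageClass::CrossWorkgroup;
  case 2: return StorageClass::UniformConstant;
  case 3: return StorageClass::Workgroup;
  case 4: return StorageClass::Generic;
  case 5: return HasUSMStorageClasses ? StorageClass::DeviceOnlyINTEL
                                      : StorageClass::CrossWorkgroup;
  case 6: return HasUSMStorageClasses ? StorageClass::HostOnlyINTEL
                                      : StorageClass::CrossWorkgroup;
  case 7: return StorageClass::Input;
  default: std::abort(); // the real code reports a fatal error here
  }
}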
auto allPtrsScalarsAndVectors = { - p0, p1, p2, p3, p4, p5, s1, s8, s16, - s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, - v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, - v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + p0, p1, p2, p3, p4, p5, p6, s1, s8, s16, + s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, + v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, + v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; auto allScalarsAndVectors = { s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, @@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { auto allFloatAndIntScalars = allIntScalars; - auto allPtrs = {p0, p1, p2, p3, p4, p5}; - auto allWritablePtrs = {p0, p1, p3, p4}; + auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; + auto allWritablePtrs = {p0, p1, p3, p4, p5, p6}; for (auto Opc : TypeFoldingSupportingOpcs) getActionDefinitionsBuilder(Opc).custom(); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index dbda287..3be28c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1063,12 +1063,28 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR); } break; + case SPIRV::OpPtrCastToCrossWorkgroupINTEL: + case SPIRV::OpCrossWorkgroupCastToPtrINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes); + Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL); + } + break; case SPIRV::OpConstantFunctionPointerINTEL: if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) { Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers); Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL); } break; + case SPIRV::OpGroupNonUniformRotateKHR: + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate)) + report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the " + "following SPIR-V extension: SPV_KHR_subgroup_rotate", + false); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate); + Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR); + Reqs.addCapability(SPIRV::Capability::GroupNonUniform); + break; case SPIRV::OpGroupIMulKHR: case SPIRV::OpGroupFMulKHR: case SPIRV::OpGroupBitwiseAndKHR: diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index cbc16fa..1442168 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) { static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget()); SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(4).getImm())); + addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST)); // If 
the bitcast would be redundant, replace all uses with the source // register. @@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<MachineInstr *, 10> ToErase; @@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(3).getImm())); + addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST)); MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index e186154..79f1614 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -49,6 +49,12 @@ cl::list<SPIRV::Extension::Extension> Extensions( clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone", "Adds OptNoneINTEL value for Function Control mask that " "indicates a request to not optimize the function."), + clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes, + "SPV_INTEL_usm_storage_classes", + "Introduces two new storage classes that are subclasses of " + "the CrossWorkgroup storage class " + "that provide additional information that can enable " + "optimization."), clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups", "Allows work items in a subgroup to share data without the " "use of local memory and work group barriers, and to " @@ -75,6 +81,10 @@ cl::list<SPIRV::Extension::Extension> Extensions( "Allows to use the LinkOnceODR linkage type that is to let " "a function or global variable to be merged with other functions " "or global variables of the same name when linkage occurs."), + clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate, + "SPV_KHR_subgroup_rotate", + "Adds a new instruction that enables rotating values across " + "invocations within a subgroup."), clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers, "SPV_INTEL_function_pointers", "Allows translation of function pointers."))); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 4e5ac0d..b022b97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], []>; defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>; defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>; defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>; +defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>; defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm
AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; @@ -462,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; +defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -699,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>; defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>; defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>; defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>; +defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>; +defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Dim enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 05f766d..169d7cc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "SPIRV.h" #include "SPIRVInstrInfo.h" +#include "SPIRVSubtarget.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -146,15 +147,19 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 3; case SPIRV::StorageClass::Generic: return 4; + case SPIRV::StorageClass::DeviceOnlyINTEL: + return 5; + case SPIRV::StorageClass::HostOnlyINTEL: + return 6; case SPIRV::StorageClass::Input: return 7; default: - llvm_unreachable("Unable to get address space id"); + report_fatal_error("Unable to get address space id"); } } SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace) { +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) { switch (AddrSpace) { case 0: return SPIRV::StorageClass::Function; @@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) { return SPIRV::StorageClass::Workgroup; case 4: return SPIRV::StorageClass::Generic; + case 5: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::DeviceOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; + case 6: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? 
SPIRV::StorageClass::HostOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; case 7: return SPIRV::StorageClass::Input; default: - llvm_unreachable("Unknown address space"); + report_fatal_error("Unknown address space"); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index a33dc02..1af53dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -27,6 +27,7 @@ class MachineRegisterInfo; class Register; class StringRef; class SPIRVInstrInfo; +class SPIRVSubtarget; // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit @@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC); // Convert an LLVM IR address space to a SPIR-V storage class. SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace); +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI); SPIRV::MemorySemantics::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7c47790..36f0679 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,6 +43,8 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +extern cl::opt<bool> WasmEmitMultiValue; + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return Subtarget->hasMultivalue() || Outs.size() <= 1; + return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || + Outs.size() <= 1) && "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 1e95911..b969b83 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt<bool> WasmEmitMultiValue; + WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. 
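As background for the multivalue gating threaded through these WebAssembly hunks: when hasMultivalue() && WasmEmitMultiValue is false, computeSignatureVTs demotes a multi-value return to an sret-style out-parameter. A standalone C++ analogy of what that demotion means at the signature level (names are illustrative):

#include <cassert>
#include <cstdint>

struct Pair { uint64_t Lo, Hi; };

// With multivalue, the wasm signature can return (i64, i64) directly.
Pair splitMulti(uint64_t X) { return {X & 0xffffffffu, X >> 32}; }

// Without it, the return is demoted: the caller passes a pointer (the "sret"
// slot, an extra iPTR param) and the callee stores the results through it.
void splitDemoted(uint64_t X, Pair *Sret) { *Sret = splitMulti(X); }

int main() {
  Pair P;
  splitDemoted(0x123456789abcdef0, &P);
  assert(P.Lo == 0x9abcdef0 && P.Hi == 0x12345678);
}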
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029..2a84c90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,6 +24,8 @@
 
 using namespace llvm;
 
+extern cl::opt<bool> WasmEmitMultiValue;
+
 namespace {
 
 enum RuntimeLibcallSignature {
@@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_func_f32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
      Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F32);
     break;
   case i64_i64_func_f64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F64);
     break;
   case i16_i16_func_i16_i16:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i32_i32_func_i32_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_iPTR:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
@@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i32:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64:
-    if (Subtarget.hasMultivalue()) {
+    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
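The near-identical hunks above all apply one rule to the i64-pair libcall signatures. A condensed sketch of that rule (names are illustrative; the else-branch pointer parameter is elided in the hunks and assumed here from the usual sret-style fallback):

    #include <vector>

    enum class ValType { I32, I64, F32, F64 };

    // Return the i64 pair directly only when the subtarget has multivalue
    // AND the temporary flag opts in; otherwise assume the results travel
    // through a pointer parameter, as the pre-multivalue lowering did.
    void selectI64PairLibcall(bool HasMultivalue, bool EmitMultiValue,
                              ValType PtrTy, // I32 on wasm32, I64 on wasm64
                              std::vector<ValType> &Rets,
                              std::vector<ValType> &Params) {
      if (HasMultivalue && EmitMultiValue) {
        Rets.push_back(ValType::I64);
        Rets.push_back(ValType::I64);
      } else {
        Params.push_back(PtrTy); // assumed fallback: results via pointer
      }
      Params.push_back(ValType::F64); // the libcall's real argument(s)
    }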
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 42043a7..3120b6b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,6 +54,15 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
              " irreducible control flow optimization pass"),
     cl::init(false));
 
+// A temporary option to control emission of multivalue until multivalue
+// implementation is stable enough. We currently don't emit multivalue by
+// default even if the feature section allows it.
+// TODO Stabilize multivalue and delete this option
+cl::opt<bool>
+    WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
+                       cl::desc("WebAssembly: Emit multivalue in the backend"),
+                       cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
   // Register the target.
   RegisterTargetMachine<WebAssemblyTargetMachine> X(
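Because the option is cl::Hidden it is omitted from -help but can still be set on the command line. Assuming the usual llc plumbing (this invocation is an illustration, not from the commit), something like

    llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue test.ll

should exercise the guarded multivalue paths, while dropping -wasm-emit-multivalue keeps the single-value MVP lowering even when the multivalue feature bit is set.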
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 4a11dd2..a620ba9 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
 } // namespace
 
 Error X86TargetMachine::buildCodeGenPipeline(
-    ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
-    MachineFunctionAnalysisManager &, raw_pwrite_stream &Out,
-    raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
-    CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) {
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, CGPassBuilderOption Opt,
+    PassInstrumentationCallbacks *PIC) {
   auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC);
-  return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType);
+  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
 }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index f31c971..0fd3e47 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,10 +58,9 @@ public:
   createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
                             const TargetSubtargetInfo *STI) const override;
 
-  Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &,
-                             MachineFunctionAnalysisManager &,
-                             raw_pwrite_stream &, raw_pwrite_stream *,
-                             CodeGenFileType, CGPassBuilderOption,
+  Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
+                             raw_pwrite_stream *, CodeGenFileType,
+                             CGPassBuilderOption,
                              PassInstrumentationCallbacks *) override;
 
   bool isJIT() const { return IsJIT; }
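The same signature narrowing applies to any target that overrides this hook. A sketch of the post-change shape for a hypothetical out-of-tree target (MyTargetMachine and MyCodeGenPassBuilder are illustrative, mirroring the X86 change above):

    // Presumably the machine-function pass and analysis managers are now
    // driven from inside the module pipeline, so only the ModulePassManager
    // is threaded through the hook.
    Error MyTargetMachine::buildCodeGenPipeline(
        ModulePassManager &MPM, raw_pwrite_stream &Out,
        raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
        CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) {
      auto CGPB = MyCodeGenPassBuilder(*this, Opt, PIC);
      return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
    }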