diff options
Diffstat (limited to 'llvm/lib')
122 files changed, 4053 insertions, 1716 deletions
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 6bf0d2f..5916d2a 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, if (Size.getBitWidth() > 64) return false; - const uint64_t LoadSize = Size.getZExtValue(); + const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue()); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, // Handle trivial cases. if (AccessedPtr == V && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; } return false; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 04f3172..653b3d4 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7194,6 +7194,21 @@ bool llvm::propagatesPoison(const Use &PoisonOp) { // corresponding lanes are poison. return true; case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::abs: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::bitreverse: + case Intrinsic::bswap: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::sshl_sat: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::ushl_sat: return true; } } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 515a1d0..832907a 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3060,48 +3060,49 @@ Error BitcodeReader::parseConstants() { V = Constant::getNullValue(CurTy); break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid integer const record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid wide integer const record"); - APInt VInt = - readWideAPInt(Record, cast<IntegerType>(CurTy)->getBitWidth()); - V = ConstantInt::get(Context, VInt); - + auto *ScalarTy = cast<IntegerType>(CurTy->getScalarType()); + APInt VInt = readWideAPInt(Record, ScalarTy->getBitWidth()); + V = ConstantInt::get(CurTy, VInt); break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) return error("Invalid float const record"); - if (CurTy->isHalfTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(), - APInt(16, (uint16_t)Record[0]))); - else if (CurTy->isBFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::BFloat(), - APInt(16, (uint32_t)Record[0]))); - else if (CurTy->isFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(), - APInt(32, (uint32_t)Record[0]))); - else if (CurTy->isDoubleTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(), - APInt(64, Record[0]))); - else if (CurTy->isX86_FP80Ty()) { + + auto *ScalarTy = CurTy->getScalarType(); + if (ScalarTy->isHalfTy()) + V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEhalf(), + APInt(16, (uint16_t)Record[0]))); + else if (ScalarTy->isBFloatTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::BFloat(), APInt(16, (uint32_t)Record[0]))); + else if (ScalarTy->isFloatTy()) + V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEsingle(), + APInt(32, (uint32_t)Record[0]))); + else if (ScalarTy->isDoubleTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0]))); + else if (ScalarTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. uint64_t Rearrange[2]; Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16); Rearrange[1] = Record[0] >> 48; - V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(), - APInt(80, Rearrange))); - } else if (CurTy->isFP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(), - APInt(128, Record))); - else if (CurTy->isPPC_FP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(), - APInt(128, Record))); + V = ConstantFP::get( + CurTy, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange))); + } else if (ScalarTy->isFP128Ty()) + V = ConstantFP::get(CurTy, + APFloat(APFloat::IEEEquad(), APInt(128, Record))); + else if (ScalarTy->isPPC_FP128Ty()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record))); else V = UndefValue::get(CurTy); break; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 13be0b0..656f2a6 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2624,7 +2624,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, } } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { Code = bitc::CST_CODE_FLOAT; - Type *Ty = CFP->getType(); + Type *Ty = CFP->getType()->getScalarType(); if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() || Ty->isDoubleTy()) { Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 7b66a85..3b84624 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -829,11 +829,7 @@ class MemLocFragmentFill { void process(BasicBlock &BB, VarFragMap &LiveSet) { BBInsertBeforeMap[&BB].clear(); for (auto &I : BB) { - for (DbgRecord &DR : I.getDbgValueRange()) { - // FIXME: DPValue::filter usage needs attention in this file; we need - // to make sure dbg.labels are handled correctly in RemoveDIs mode. - // Cast below to ensure this gets fixed when DPLabels are introduced. - DPValue &DPV = cast<DPValue>(DR); + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (const auto *Locs = FnVarLocs->getWedge(&DPV)) { for (const VarLocInfo &Loc : *Locs) { addDef(Loc, &DPV, *I.getParent(), LiveSet); @@ -1919,6 +1915,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) { // attached DPValues, or a non-debug instruction with attached unprocessed // DPValues. if (II != EI && II->hasDbgValues()) { + // Skip over non-variable debug records (i.e., labels). They're going to + // be read from IR (possibly re-ordering them within the debug record + // range) rather than from the analysis results. for (DPValue &DPV : DPValue::filter(II->getDbgValueRange())) { resetInsertionPoint(DPV); processDPValue(DPV, LiveSet); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 4036f18..feefe87 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2686,8 +2686,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, attributesPermitTailCall(F, CI, RetI, *TLI)) { // Either we return void or the return value must be the first // argument of a known intrinsic or library function. - if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && - V == CI->getArgOperand(0))) { + if (!V || isa<UndefValue>(V) || + (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && + V == CI->getArgOperand(0))) { TailCallBBs.push_back(Pred); } } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 3bd1542..77dc265 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -187,7 +187,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, if (!lowerCall(MIRBuilder, Info)) return false; - if (ReturnHintAlignReg && !Info.IsTailCall) { + if (ReturnHintAlignReg && !Info.LoweredTailCall) { MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg, ReturnHintAlign); } diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7c95cef..38bb808 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3275,7 +3275,17 @@ void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList, void IRTranslator::translateDbgInfo(const Instruction &Inst, MachineIRBuilder &MIRBuilder) { - for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { + for (DbgRecord &DR : Inst.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) { + MIRBuilder.setDebugLoc(DPL->getDebugLoc()); + assert(DPL->getLabel() && "Missing label"); + assert(DPL->getLabel()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + MIRBuilder.buildDbgLabel(DPL->getLabel()); + continue; + } + DPValue &DPV = cast<DPValue>(DR); const DILocalVariable *Variable = DPV.getVariable(); const DIExpression *Expression = DPV.getExpression(); Value *V = DPV.getVariableLocationOp(0); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 26fd12f..23ad68b 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -660,8 +660,11 @@ std::optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, default: break; case TargetOpcode::G_ADD: - case TargetOpcode::G_PTR_ADD: return C1 + C2; + case TargetOpcode::G_PTR_ADD: + // Types can be of different width here. + // Result needs to be the same width as C1, so trunc or sext C2. + return C1 + C2.sextOrTrunc(C1.getBitWidth()); case TargetOpcode::G_AND: return C1 & C2; case TargetOpcode::G_ASHR: diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 38c1c56..0ddd945 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -109,12 +109,6 @@ static bool isColdBlock(const MachineBasicBlock &MBB, const MachineBlockFrequencyInfo *MBFI, ProfileSummaryInfo *PSI) { std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB); - - // Temporary hack to cope with AArch64's jump table encoding - const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo(); - if (!TII.isMBBSafeToSplitToCold(MBB)) - return false; - // For instrumentation profiles and sample profiles, we use different ways // to judge whether a block is cold and should be split. if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) { @@ -178,7 +172,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { if (MBB.isEHPad()) LandingPads.push_back(&MBB); - else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode) + else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && + TII.isMBBSafeToSplitToCold(MBB) && !SplitAllEHCode) MBB.setSectionID(MBBSectionID::ColdSectionID); } @@ -190,7 +185,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { // Here we have UseProfileData == true. bool HasHotLandingPads = false; for (const MachineBasicBlock *LP : LandingPads) { - if (!isColdBlock(*LP, MBFI, PSI)) + if (!isColdBlock(*LP, MBFI, PSI) || !TII.isMBBSafeToSplitToCold(*LP)) HasHotLandingPads = true; } if (!HasHotLandingPads) { diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp index d42bbe2..9a750b5 100644 --- a/llvm/lib/CodeGen/MachinePassManager.cpp +++ b/llvm/lib/CodeGen/MachinePassManager.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachinePassManager.h" -#include "llvm/CodeGen/FreeMachineFunction.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/PassManagerImpl.h" @@ -19,99 +18,121 @@ using namespace llvm; namespace llvm { -template class AllAnalysesOn<MachineFunction>; + +AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key; + template class AnalysisManager<MachineFunction>; template class PassManager<MachineFunction>; +template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager, + Module>; +template class OuterAnalysisManagerProxy<ModuleAnalysisManager, + MachineFunction>; + +bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate( + MachineFunction &IR, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // MachineFunction passes should not invalidate Function analyses. + // TODO: verify that PA doesn't invalidate Function analyses. + return false; +} -Error MachineFunctionPassManager::run(Module &M, - MachineFunctionAnalysisManager &MFAM) { - // MachineModuleAnalysis is a module analysis pass that is never invalidated - // because we don't run any module pass in codegen pipeline. This is very - // important because the codegen state is stored in MMI which is the analysis - // result of MachineModuleAnalysis. MMI should not be recomputed. - auto &MMI = MFAM.getResult<MachineModuleAnalysis>(M).getMMI(); - - (void)RequireCodeGenSCCOrder; - assert(!RequireCodeGenSCCOrder && "not implemented"); - - // M is unused here - PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(M); - - // Add a PIC to verify machine functions. - if (VerifyMachineFunction) { - // No need to pop this callback later since MIR pipeline is flat which means - // current pipeline is the top-level pipeline. Callbacks are not used after - // current pipeline. - PI.pushBeforeNonSkippedPassCallback([](StringRef PassID, Any IR) { - assert(llvm::any_cast<const MachineFunction *>(&IR)); - const MachineFunction *MF = llvm::any_cast<const MachineFunction *>(IR); - assert(MF && "Machine function should be valid for printing"); - std::string Banner = std::string("After ") + std::string(PassID); - verifyMachineFunction(Banner, *MF); - }); +template <> +bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv) { + // If literally everything is preserved, we're done. + if (PA.areAllPreserved()) + return false; // This is still a valid proxy. + + // If this proxy isn't marked as preserved, then even if the result remains + // valid, the key itself may no longer be valid, so we clear everything. + // + // Note that in order to preserve this proxy, a module pass must ensure that + // the MFAM has been completely updated to handle the deletion of functions. + // Specifically, any MFAM-cached results for those functions need to have been + // forcibly cleared. When preserved, this proxy will only invalidate results + // cached on functions *still in the module* at the end of the module pass. + auto PAC = PA.getChecker<MachineFunctionAnalysisManagerModuleProxy>(); + if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Module>>()) { + InnerAM->clear(); + return true; } - for (auto &F : InitializationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + // FIXME: be more precise, see + // FunctionAnalysisManagerModuleProxy::Result::invalidate. + if (!PA.allAnalysesInSetPreserved<AllAnalysesOn<MachineFunction>>()) { + InnerAM->clear(); + return true; } - unsigned Idx = 0; - size_t Size = Passes.size(); - do { - // Run machine module passes - for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) { - if (!PI.runBeforePass<Module>(*Passes[Idx], M)) - continue; - if (auto Err = MachineModulePasses.at(Idx)(M, MFAM)) - return Err; - PI.runAfterPass(*Passes[Idx], M, PreservedAnalyses::all()); - } - - // Finish running all passes. - if (Idx == Size) - break; - - // Run machine function passes - - // Get index range of machine function passes. - unsigned Begin = Idx; - for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx) - ; - - for (Function &F : M) { - // Do not codegen any 'available_externally' functions at all, they have - // definitions outside the translation unit. - if (F.hasAvailableExternallyLinkage()) - continue; - - MachineFunction &MF = MMI.getOrCreateMachineFunction(F); - - for (unsigned I = Begin, E = Idx; I != E; ++I) { - auto *P = Passes[I].get(); + // Return false to indicate that this result is still a valid proxy. + return false; +} - if (!PI.runBeforePass<MachineFunction>(*P, MF)) - continue; +PreservedAnalyses +ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { + auto &MMI = AM.getResult<MachineModuleAnalysis>(M).getMMI(); + MachineFunctionAnalysisManager &MFAM = + AM.getResult<MachineFunctionAnalysisManagerModuleProxy>(M).getManager(); + PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(M); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (Function &F : M) { + // Do not codegen any 'available_externally' functions at all, they have + // definitions outside the translation unit. + if (F.hasAvailableExternallyLinkage()) + continue; + + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + + if (!PI.runBeforePass<MachineFunction>(*Pass, MF)) + continue; + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA); + } + PA.intersect(std::move(PassPA)); + } - // TODO: EmitSizeRemarks - PreservedAnalyses PassPA = P->run(MF, MFAM); + return PA; +} - // MF is dangling after FreeMachineFunctionPass - if (P->name() != FreeMachineFunctionPass::name()) { - MFAM.invalidate(MF, PassPA); +void ModuleToMachineFunctionPassAdaptor::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + OS << "machine-function("; + Pass->printPipeline(OS, MapClassName2PassName); + OS << ')'; +} - PI.runAfterPass(*P, MF, PassPA); - } - } +template <> +PreservedAnalyses +PassManager<MachineFunction>::run(MachineFunction &MF, + AnalysisManager<MachineFunction> &MFAM) { + PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(MF); + Function &F = MF.getFunction(); + MachineModuleInfo &MMI = + MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) + .getCachedResult<MachineModuleAnalysis>(*F.getParent()) + ->getMMI(); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (auto &Pass : Passes) { + if (!PI.runBeforePass<MachineFunction>(*Pass, MF)) + continue; + + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA); } - } while (true); - - for (auto &F : FinalizationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + PA.intersect(std::move(PassPA)); } - - return Error::success(); + return PA; } } // namespace llvm diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 697e0da..1bda19b 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -768,7 +768,6 @@ static void getUnderlyingObjects(const MachineInstr *MI, Objs.clear(); return; } - Objs.push_back(V); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 89ef648..6a28bc8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -76,6 +76,8 @@ #include <utility> #include <variant> +#include "MatchContext.h" + using namespace llvm; #define DEBUG_TYPE "dagcombine" @@ -888,141 +890,6 @@ public: void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } }; -class EmptyMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - -public: - EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI) {} - - bool match(SDValue OpN, unsigned Opcode) const { - return Opcode == OpN->getOpcode(); - } - - // Same as SelectionDAG::getNode(). - template <typename... ArgT> SDValue getNode(ArgT &&...Args) { - return DAG.getNode(std::forward<ArgT>(Args)...); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); - } -}; - -class VPMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - SDValue RootMaskOp; - SDValue RootVectorLenOp; - -public: - VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { - assert(Root->isVPOpcode()); - if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) - RootMaskOp = Root->getOperand(*RootMaskPos); - else if (Root->getOpcode() == ISD::VP_SELECT) - RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), - Root->getOperand(0).getValueType()); - - if (auto RootVLenPos = - ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) - RootVectorLenOp = Root->getOperand(*RootVLenPos); - } - - /// whether \p OpVal is a node that is functionally compatible with the - /// NodeType \p Opc - bool match(SDValue OpVal, unsigned Opc) const { - if (!OpVal->isVPOpcode()) - return OpVal->getOpcode() == Opc; - - auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), - !OpVal->getFlags().hasNoFPExcept()); - if (BaseOpc != Opc) - return false; - - // Make sure the mask of OpVal is true mask or is same as Root's. - unsigned VPOpcode = OpVal->getOpcode(); - if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { - SDValue MaskOp = OpVal.getOperand(*MaskPos); - if (RootMaskOp != MaskOp && - !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) - return false; - } - - // Make sure the EVL of OpVal is same as Root's. - if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) - if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) - return false; - return true; - } - - // Specialize based on number of operands. - // TODO emit VP intrinsics where MaskOp/VectorLenOp != null - // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return - // DAG.getNode(Opcode, DL, VT); } - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, - {Operand, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, - SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3, SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - unsigned VPOp = ISD::getVPForBaseOpcode(Op); - return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); - } -}; - } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -13997,6 +13864,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; + // CSE zext nneg with sext if the zext is not free. + if (N->getFlags().hasNonNeg() && !TLI.isZExtFree(N0.getValueType(), VT)) { + SDNode *CSENode = DAG.getNodeIfExists(ISD::SIGN_EXTEND, N->getVTList(), N0); + if (CSENode) + return SDValue(CSENode, 0); + } + return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 5651498..246762d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1188,11 +1188,24 @@ void FastISel::handleDbgInfo(const Instruction *II) { MIMD = MIMetadata(); // Reverse order of debug records, because fast-isel walks through backwards. - for (DbgRecord &DPR : llvm::reverse(II->getDbgValueRange())) { + for (DbgRecord &DR : llvm::reverse(II->getDbgValueRange())) { flushLocalValueMap(); recomputeInsertPt(); - DPValue &DPV = cast<DPValue>(DPR); + if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) { + assert(DPL->getLabel() && "Missing label"); + if (!FuncInfo.MF->getMMI().hasDebugInfo()) { + LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DPL << "\n"); + continue; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DPL->getDebugLoc(), + TII.get(TargetOpcode::DBG_LABEL)) + .addMetadata(DPL->getLabel()); + continue; + } + + DPValue &DPV = cast<DPValue>(DR); Value *V = nullptr; if (!DPV.hasArgList()) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a4ba261..df17d65 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -217,7 +217,15 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SSUBSAT: case ISD::USUBSAT: case ISD::SSHLSAT: - case ISD::USHLSAT: Res = PromoteIntRes_ADDSUBSHLSAT(N); break; + case ISD::USHLSAT: + Res = PromoteIntRes_ADDSUBSHLSAT<EmptyMatchContext>(N); + break; + case ISD::VP_SADDSAT: + case ISD::VP_UADDSAT: + case ISD::VP_SSUBSAT: + case ISD::VP_USUBSAT: + Res = PromoteIntRes_ADDSUBSHLSAT<VPMatchContext>(N); + break; case ISD::SMULFIX: case ISD::SMULFIXSAT: @@ -934,6 +942,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT); } +template <class MatchContextClass> SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { // If the promoted type is legal, we can convert this to: // 1. ANY_EXTEND iN to iM @@ -945,11 +954,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDLoc dl(N); SDValue Op1 = N->getOperand(0); SDValue Op2 = N->getOperand(1); + MatchContextClass matcher(DAG, TLI, N); unsigned OldBits = Op1.getScalarValueSizeInBits(); - unsigned Opcode = N->getOpcode(); + unsigned Opcode = matcher.getRootBaseOpcode(); bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT; + // FIXME: We need vp-aware PromotedInteger functions. SDValue Op1Promoted, Op2Promoted; if (IsShift) { Op1Promoted = GetPromotedInteger(Op1); @@ -968,18 +979,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Add = - DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); + matcher.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); } // USUBSAT can always be promoted as long as we have zero-extended the args. if (Opcode == ISD::USUBSAT) - return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, - Op2Promoted); + return matcher.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, + Op2Promoted); // Shift cannot use a min/max expansion, we can't detect overflow if all of // the bits have been shifted out. - if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) { + if (IsShift || matcher.isOperationLegal(Opcode, PromotedType)) { unsigned ShiftOp; switch (Opcode) { case ISD::SADDSAT: @@ -1002,11 +1013,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); if (!IsShift) Op2Promoted = - DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); + matcher.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); SDValue Result = - DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); + matcher.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB; @@ -1015,9 +1026,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Result = - DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); - Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); - Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); + matcher.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); + Result = matcher.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); + Result = matcher.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); return Result; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9114987..3c84f67 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H +#include "MatchContext.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -355,6 +356,7 @@ private: SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_VSCALE(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); + template <class MatchContextClass> SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N); SDValue PromoteIntRes_MULFIX(SDNode *N); SDValue PromoteIntRes_DIVFIX(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2a7aaf8..6074498 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -404,8 +404,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FCEIL: case ISD::FTRUNC: case ISD::FRINT: - case ISD::LRINT: - case ISD::LLRINT: case ISD::FNEARBYINT: case ISD::FROUND: case ISD::FROUNDEVEN: @@ -455,6 +453,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Node->getValueType(0), Scale); break; } + case ISD::LRINT: + case ISD::LLRINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::VECREDUCE_ADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7fc2526..90cda2a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1163,10 +1163,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::SADDSAT: - case ISD::UADDSAT: - case ISD::SSUBSAT: - case ISD::USUBSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: @@ -4140,10 +4140,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h new file mode 100644 index 0000000..f965cb9 --- /dev/null +++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h @@ -0,0 +1,175 @@ +//===---------------- llvm/CodeGen/MatchContext.h --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the EmptyMatchContext class and VPMatchContext class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H + +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" + +using namespace llvm; + +namespace { +class EmptyMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDNode *Root; + +public: + EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) + : DAG(DAG), TLI(TLI), Root(Root) {} + + unsigned getRootBaseOpcode() { return Root->getOpcode(); } + bool match(SDValue OpN, unsigned Opcode) const { + return Opcode == OpN->getOpcode(); + } + + // Same as SelectionDAG::getNode(). + template <typename... ArgT> SDValue getNode(ArgT &&...Args) { + return DAG.getNode(std::forward<ArgT>(Args)...); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + return TLI.isOperationLegal(Op, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); + } +}; + +class VPMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDValue RootMaskOp; + SDValue RootVectorLenOp; + SDNode *Root; + +public: + VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *_Root) + : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { + Root = _Root; + assert(Root->isVPOpcode()); + if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) + RootMaskOp = Root->getOperand(*RootMaskPos); + else if (Root->getOpcode() == ISD::VP_SELECT) + RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), + Root->getOperand(0).getValueType()); + + if (auto RootVLenPos = ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) + RootVectorLenOp = Root->getOperand(*RootVLenPos); + } + + unsigned getRootBaseOpcode() { + std::optional<unsigned> Opcode = ISD::getBaseOpcodeForVP( + Root->getOpcode(), !Root->getFlags().hasNoFPExcept()); + assert(Opcode.has_value()); + return *Opcode; + } + + /// whether \p OpVal is a node that is functionally compatible with the + /// NodeType \p Opc + bool match(SDValue OpVal, unsigned Opc) const { + if (!OpVal->isVPOpcode()) + return OpVal->getOpcode() == Opc; + + auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), + !OpVal->getFlags().hasNoFPExcept()); + if (BaseOpc != Opc) + return false; + + // Make sure the mask of OpVal is true mask or is same as Root's. + unsigned VPOpcode = OpVal->getOpcode(); + if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { + SDValue MaskOp = OpVal.getOperand(*MaskPos); + if (RootMaskOp != MaskOp && + !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) + return false; + } + + // Make sure the EVL of OpVal is same as Root's. + if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) + if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) + return false; + return true; + } + + // Specialize based on number of operands. + // TODO emit VP intrinsics where MaskOp/VectorLenOp != null + // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return + // DAG.getNode(Opcode, DL, VT); } + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, + {Operand, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, + SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegal(VPOp, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); + } +}; +} // end anonymous namespace +#endif diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index add92cf..0ceda27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9044,29 +9044,6 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, SDValue SelectionDAG::getStridedLoadVP( ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - const MDNode *Ranges, bool IsExpanding) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOLoad; - assert((MMOFlags & MachineMemOperand::MOStore) == 0); - // If we don't have a PtrInfo, infer the trivial frame index case to simplify - // clients. - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset); - - uint64_t Size = MemoryLocation::UnknownSize; - MachineFunction &MF = getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, - Alignment, AAInfo, Ranges); - return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask, - EVL, MemVT, MMO, IsExpanding); -} - -SDValue SelectionDAG::getStridedLoadVP( - ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, - SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) { bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!"); @@ -9098,17 +9075,6 @@ SDValue SelectionDAG::getStridedLoadVP( return V; } -SDValue SelectionDAG::getStridedLoadVP( - EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, - SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - const MDNode *Ranges, bool IsExpanding) { - SDValue Undef = getUNDEF(Ptr.getValueType()); - return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr, - Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment, - MMOFlags, AAInfo, Ranges, IsExpanding); -} - SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, @@ -9121,18 +9087,6 @@ SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue SelectionDAG::getExtStridedLoadVP( ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, - SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, - MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - bool IsExpanding) { - SDValue Undef = getUNDEF(Ptr.getValueType()); - return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef, - Stride, Mask, EVL, PtrInfo, MemVT, Alignment, - MMOFlags, AAInfo, nullptr, IsExpanding); -} - -SDValue SelectionDAG::getExtStridedLoadVP( - ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) { SDValue Undef = getUNDEF(Ptr.getValueType()); @@ -9150,11 +9104,14 @@ SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL, auto MMOFlags = SLD->getMemOperand()->getFlags() & ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); - return getStridedLoadVP( - AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(), - Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(), - SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags, - SLD->getAAInfo(), nullptr, SLD->isExpandingLoad()); + MachineFunction &MF = getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + SLD->getPointerInfo(), MMOFlags, SLD->getMemOperand()->getSize(), + SLD->getOriginalAlign(), SLD->getAAInfo()); + return getStridedLoadVP(AM, SLD->getExtensionType(), OrigLoad.getValueType(), + DL, SLD->getChain(), Base, Offset, SLD->getStride(), + SLD->getMask(), SLD->getVectorLength(), + SLD->getMemoryVT(), MMO, SLD->isExpandingLoad()); } SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL, @@ -9193,26 +9150,6 @@ SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL, return V; } -SDValue SelectionDAG::getTruncStridedStoreVP( - SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, - SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT, - Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - bool IsCompressing) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOStore; - assert((MMOFlags & MachineMemOperand::MOLoad) == 0); - - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); - - MachineFunction &MF = getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo); - return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT, - MMO, IsCompressing); -} - SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, SDValue Mask, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e893a5b..ee600d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1241,17 +1241,30 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) { It->Expr, Vals.size() > 1, It->DL, SDNodeOrder); } } - // We must early-exit here to prevent any DPValues from being emitted below, - // as we have just emitted the debug values resulting from assignment - // tracking analysis, making any existing DPValues redundant (and probably - // less correct). - return; } + // We must skip DPValues if they've already been processed above as we + // have just emitted the debug values resulting from assignment tracking + // analysis, making any existing DPValues redundant (and probably less + // correct). We still need to process DPLabels. This does sink DPLabels + // to the bottom of the group of debug records. That sholdn't be important + // as it does so deterministcally and ordering between DPLabels and DPValues + // is immaterial (other than for MIR/IR printing). + bool SkipDPValues = DAG.getFunctionVarLocs(); // Is there is any debug-info attached to this instruction, in the form of - // DPValue non-instruction debug-info records. - for (DbgRecord &DPR : I.getDbgValueRange()) { - DPValue &DPV = cast<DPValue>(DPR); + // DbgRecord non-instruction debug-info records. + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) { + assert(DPL->getLabel() && "Missing label"); + SDDbgLabel *SDV = + DAG.getDbgLabel(DPL->getLabel(), DPL->getDebugLoc(), SDNodeOrder); + DAG.AddDbgLabel(SDV); + continue; + } + + if (SkipDPValues) + continue; + DPValue &DPV = cast<DPValue>(DR); DILocalVariable *Variable = DPV.getVariable(); DIExpression *Expression = DPV.getExpression(); dropDanglingDebugInfo(Variable, Expression); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a4c5167..07fb891 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const { Op = expandRoundInexactToOdd(F32, Op, dl, DAG); Op = DAG.getNode(ISD::BITCAST, dl, I32, Op); - // Extract the sign bit and exponent. - SDValue SignBitAndExponentField = DAG.getNode( - ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32)); - // Set the quiet bit. - SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField, - DAG.getConstant(0x400000, dl, I32)); + // Conversions should set NaN's quiet bit. This also prevents NaNs from + // turning into infinities. + SDValue NaN = + DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32)); // Factor in the contribution of the low 16 bits. SDValue One = DAG.getConstant(1, dl, I32); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 78f819d..9c65d85 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -510,7 +510,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() { Expected<DWARFDebugNames::AttributeEncoding> DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -536,7 +536,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) { Expected<DWARFDebugNames::Abbrev> DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -552,32 +552,50 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { return Abbrev(Code, dwarf::Tag(Tag), AbbrevOffset, std::move(*AttrEncOr)); } +void llvm::findDebugNamesOffsets( + DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, uint64_t HdrSize, + dwarf::DwarfFormat Format, const DWARFDebugNames::Header &Hdr) { + uint32_t DwarfSize = (Format == llvm::dwarf::DwarfFormat::DWARF64) ? 8 : 4; + uint64_t Offset = HdrSize; + Offsets.CUsBase = Offset; + Offset += Hdr.CompUnitCount * DwarfSize; + Offset += Hdr.LocalTypeUnitCount * DwarfSize; + Offset += Hdr.ForeignTypeUnitCount * 8; + + Offsets.BucketsBase = Offset; + Offset += Hdr.BucketCount * 4; + + Offsets.HashesBase = Offset; + if (Hdr.BucketCount > 0) + Offset += Hdr.NameCount * 4; + + Offsets.StringOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offsets.EntryOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offset += Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset; +} + Error DWARFDebugNames::NameIndex::extract() { const DWARFDataExtractor &AS = Section.AccelSection; - uint64_t Offset = Base; - if (Error E = Hdr.extract(AS, &Offset)) + uint64_t hdrSize = Base; + if (Error E = Hdr.extract(AS, &hdrSize)) return E; const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - CUsBase = Offset; - Offset += Hdr.CompUnitCount * SectionOffsetSize; - Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize; - Offset += Hdr.ForeignTypeUnitCount * 8; - BucketsBase = Offset; - Offset += Hdr.BucketCount * 4; - HashesBase = Offset; - if (Hdr.BucketCount > 0) - Offset += Hdr.NameCount * 4; - StringOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; - EntryOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; + findDebugNamesOffsets(Offsets, hdrSize, Hdr.Format, Hdr); + + uint64_t Offset = + Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize); if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize)) return createStringError(errc::illegal_byte_sequence, "Section too small: cannot read abbreviations."); - EntriesBase = Offset + Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset + Hdr.AbbrevTableSize; for (;;) { auto AbbrevOr = extractAbbrev(&Offset); @@ -679,7 +697,7 @@ void DWARFDebugNames::Entry::dumpParentIdx( return; } - auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue(); + auto AbsoluteOffset = NameIdx->Offsets.EntriesBase + FormValue.getRawUValue(); W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset); } @@ -708,14 +726,15 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const { uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const { assert(CU < Hdr.CompUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * CU; + uint64_t Offset = Offsets.CUsBase + SectionOffsetSize * CU; return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const { assert(TU < Hdr.LocalTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); + uint64_t Offset = + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } @@ -723,7 +742,7 @@ uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const { assert(TU < Hdr.ForeignTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t Offset = - CUsBase + + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU; return Section.AccelSection.getU64(&Offset); } @@ -759,28 +778,28 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t StringOffsetOffset = - StringOffsetsBase + SectionOffsetSize * (Index - 1); + Offsets.StringOffsetsBase + SectionOffsetSize * (Index - 1); uint64_t EntryOffsetOffset = - EntryOffsetsBase + SectionOffsetSize * (Index - 1); + Offsets.EntryOffsetsBase + SectionOffsetSize * (Index - 1); const DWARFDataExtractor &AS = Section.AccelSection; uint64_t StringOffset = AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset); uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize); - EntryOffset += EntriesBase; + EntryOffset += Offsets.EntriesBase; return {Section.StringSection, Index, StringOffset, EntryOffset}; } uint32_t DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const { assert(Bucket < Hdr.BucketCount); - uint64_t BucketOffset = BucketsBase + 4 * Bucket; + uint64_t BucketOffset = Offsets.BucketsBase + 4 * Bucket; return Section.AccelSection.getU32(&BucketOffset); } uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); - uint64_t HashOffset = HashesBase + 4 * (Index - 1); + uint64_t HashOffset = Offsets.HashesBase + 4 * (Index - 1); return Section.AccelSection.getU32(&HashOffset); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 28f0564..572628f 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -389,9 +389,25 @@ Error DWARFDebugLine::Prologue::parse( if (getVersion() >= 5) { FormParams.AddrSize = DebugLineData.getU8(Cursor); - assert((!Cursor || DebugLineData.getAddressSize() == 0 || - DebugLineData.getAddressSize() == getAddressSize()) && - "Line table header and data extractor disagree"); + const uint8_t DataAddrSize = DebugLineData.getAddressSize(); + const uint8_t PrologueAddrSize = getAddressSize(); + if (Cursor) { + if (DataAddrSize == 0) { + if (PrologueAddrSize != 4 && PrologueAddrSize != 8) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 + ": invalid address size %" PRIu8, + PrologueOffset, PrologueAddrSize)); + } + } else if (DataAddrSize != PrologueAddrSize) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 ": address " + "size %" PRIu8 " doesn't match architecture address size %" PRIu8, + PrologueOffset, PrologueAddrSize, DataAddrSize)); + } + } SegSelectorSize = DebugLineData.getU8(Cursor); } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 251485a..fba404c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -292,8 +292,8 @@ static const Module *getModuleFromDPI(const DPMarker *Marker) { return M ? M->getParent() : nullptr; } -static const Module *getModuleFromDPI(const DPValue *DPV) { - return DPV->getMarker() ? getModuleFromDPI(DPV->getMarker()) : nullptr; +static const Module *getModuleFromDPI(const DbgRecord *DR) { + return DR->getMarker() ? getModuleFromDPI(DR->getMarker()) : nullptr; } static void PrintCallingConv(unsigned cc, raw_ostream &Out) { @@ -1141,12 +1141,14 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) { CreateMetadataSlot(DPV->getVariable()); - CreateMetadataSlot(DPV->getDebugLoc()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); + } else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) { + CreateMetadataSlot(DPL->getLabel()); } else { llvm_unreachable("unsupported DbgRecord kind"); } + CreateMetadataSlot(DR.getDebugLoc()); } void SlotTracker::processInstructionMetadata(const Instruction &I) { @@ -1505,16 +1507,39 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) { static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, AsmWriterContext &WriterCtx) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { - if (CI->getType()->isIntegerTy(1)) { - Out << (CI->getZExtValue() ? "true" : "false"); - return; + Type *Ty = CI->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; } - Out << CI->getValue(); + + if (Ty->getScalarType()->isIntegerTy(1)) + Out << (CI->getZExtValue() ? "true" : "false"); + else + Out << CI->getValue(); + + if (Ty->isVectorTy()) + Out << ")"; + return; } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { + Type *Ty = CFP->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; + } + WriteAPFloatInternal(Out, CFP->getValueAPF()); + + if (Ty->isVectorTy()) + Out << ")"; + return; } @@ -2676,6 +2701,7 @@ public: void printInstruction(const Instruction &I); void printDPMarker(const DPMarker &DPI); void printDPValue(const DPValue &DPI); + void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle); @@ -4579,8 +4605,10 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) { void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { if (auto *DPV = dyn_cast<DPValue>(&DR)) printDPValue(*DPV); + else if (auto *DPL = dyn_cast<DPLabel>(&DR)) + printDPLabel(*DPL); else - llvm_unreachable("unsupported dbg record"); + llvm_unreachable("Unexpected DbgRecord kind"); } void AssemblyWriter::printDPValue(const DPValue &Value) { @@ -4622,6 +4650,16 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { Out << " }"; } +void AssemblyWriter::printDPLabel(const DPLabel &Label) { + // There's no formal representation of a DPLabel -- print purely as + // a debugging aid. + Out << " DPLabel { "; + auto WriterCtx = getContext(); + WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); + Out << " marker @" << Label.getMarker(); + Out << " }"; +} + void AssemblyWriter::printMetadataAttachments( const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs, StringRef Separator) { @@ -4885,6 +4923,12 @@ void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPMarker(*this); } +void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { + + ModuleSlotTracker MST(getModuleFromDPI(this), true); + print(ROS, MST, IsForDebug); +} + void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { // There's no formal representation of a DPValue -- print purely as a @@ -4904,6 +4948,24 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPValue(*this); } +void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, + bool IsForDebug) const { + // There's no formal representation of a DbgLabelRecord -- print purely as + // a debugging aid. + formatted_raw_ostream OS(ROS); + SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr)); + SlotTracker &SlotTable = + MST.getMachine() ? *MST.getMachine() : EmptySlotTable; + auto incorporateFunction = [&](const Function *F) { + if (F) + MST.incorporateFunction(*F); + }; + incorporateFunction(Marker->getParent() ? Marker->getParent()->getParent() + : nullptr); + AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); + W.printDPLabel(*this); +} + void Value::print(raw_ostream &ROS, bool IsForDebug) const { bool ShouldInitializeAllMetadata = false; if (auto *I = dyn_cast<Instruction>(this)) diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index fd51602..1907677 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) { Callee.getFnAttribute(AttrClass::getKind()); } +static bool isEqual(const Function &Caller, const Function &Callee, + const StringRef &AttrName) { + return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName); +} + /// Compute the logical AND of the attributes of the caller and the /// callee. /// diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 06807544..6ea876f 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -81,6 +81,12 @@ void BasicBlock::convertToNewDbgValues() { continue; } + if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(&I)) { + DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc())); + DLI->eraseFromParent(); + continue; + } + if (DPVals.empty()) continue; @@ -107,16 +113,12 @@ void BasicBlock::convertFromNewDbgValues() { continue; DPMarker &Marker = *Inst.DbgMarker; - for (DbgRecord &DR : Marker.getDbgValueRange()) { - if (auto *DPV = dyn_cast<DPValue>(&DR)) - InstList.insert(Inst.getIterator(), - DPV->createDebugIntrinsic(getModule(), nullptr)); - else - llvm_unreachable("unsupported DbgRecord kind"); - } + for (DbgRecord &DR : Marker.getDbgValueRange()) + InstList.insert(Inst.getIterator(), + DR.createDebugIntrinsic(getModule(), nullptr)); Marker.eraseFromParent(); - }; + } // Assume no trailing DPValues: we could technically create them at the end // of the block, after a terminator, but this would be non-cannonical and diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a38b912..e6b92aa 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -35,6 +35,20 @@ using namespace llvm; using namespace PatternMatch; +// As set of temporary options to help migrate how splats are represented. +static cl::opt<bool> UseConstantIntForFixedLengthSplat( + "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native fixed-length vector splat support.")); +static cl::opt<bool> UseConstantFPForFixedLengthSplat( + "use-constant-fp-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native fixed-length vector splat support.")); +static cl::opt<bool> UseConstantIntForScalableSplat( + "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native scalable vector splat support.")); +static cl::opt<bool> UseConstantFPForScalableSplat( + "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native scalable vector splat support.")); + //===----------------------------------------------------------------------===// // Constant Class //===----------------------------------------------------------------------===// @@ -825,9 +839,11 @@ bool Constant::isManifestConstant() const { // ConstantInt //===----------------------------------------------------------------------===// -ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V) +ConstantInt::ConstantInt(Type *Ty, const APInt &V) : ConstantData(Ty, ConstantIntVal), Val(V) { - assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type"); + assert(V.getBitWidth() == + cast<IntegerType>(Ty->getScalarType())->getBitWidth() && + "Invalid constant for type"); } ConstantInt *ConstantInt::getTrue(LLVMContext &Context) { @@ -885,6 +901,26 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) { return Slot.get(); } +// Get a ConstantInt vector with each lane set to the same APInt. +ConstantInt *ConstantInt::get(LLVMContext &Context, ElementCount EC, + const APInt &V) { + // Get an existing value or the insertion position. + std::unique_ptr<ConstantInt> &Slot = + Context.pImpl->IntSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + Slot.reset(new ConstantInt(VTy, V)); + } + +#ifndef NDEBUG + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { Constant *C = get(cast<IntegerType>(Ty->getScalarType()), V, isSigned); @@ -1024,6 +1060,26 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { return Slot.get(); } +// Get a ConstantFP vector with each lane set to the same APFloat. +ConstantFP *ConstantFP::get(LLVMContext &Context, ElementCount EC, + const APFloat &V) { + // Get an existing value or the insertion position. + std::unique_ptr<ConstantFP> &Slot = + Context.pImpl->FPSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + Slot.reset(new ConstantFP(VTy, V)); + } + +#ifndef NDEBUG + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics(); Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative)); @@ -1036,7 +1092,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { ConstantFP::ConstantFP(Type *Ty, const APFloat &V) : ConstantData(Ty, ConstantFPVal), Val(V) { - assert(&V.getSemantics() == &Ty->getFltSemantics() && + assert(&V.getSemantics() == &Ty->getScalarType()->getFltSemantics() && "FP type Mismatch"); } @@ -1356,11 +1412,13 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { bool isZero = C->isNullValue(); bool isUndef = isa<UndefValue>(C); bool isPoison = isa<PoisonValue>(C); + bool isSplatFP = UseConstantFPForFixedLengthSplat && isa<ConstantFP>(C); + bool isSplatInt = UseConstantIntForFixedLengthSplat && isa<ConstantInt>(C); - if (isZero || isUndef) { + if (isZero || isUndef || isSplatFP || isSplatInt) { for (unsigned i = 1, e = V.size(); i != e; ++i) if (V[i] != C) { - isZero = isUndef = isPoison = false; + isZero = isUndef = isPoison = isSplatFP = isSplatInt = false; break; } } @@ -1371,6 +1429,12 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { return PoisonValue::get(T); if (isUndef) return UndefValue::get(T); + if (isSplatFP) + return ConstantFP::get(C->getContext(), T->getElementCount(), + cast<ConstantFP>(C)->getValue()); + if (isSplatInt) + return ConstantInt::get(C->getContext(), T->getElementCount(), + cast<ConstantInt>(C)->getValue()); // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. @@ -1384,6 +1448,16 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { if (!EC.isScalable()) { + // Maintain special handling of zero. + if (!V->isNullValue()) { + if (UseConstantIntForFixedLengthSplat && isa<ConstantInt>(V)) + return ConstantInt::get(V->getContext(), EC, + cast<ConstantInt>(V)->getValue()); + if (UseConstantFPForFixedLengthSplat && isa<ConstantFP>(V)) + return ConstantFP::get(V->getContext(), EC, + cast<ConstantFP>(V)->getValue()); + } + // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) && @@ -1394,6 +1468,16 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { return get(Elts); } + // Maintain special handling of zero. + if (!V->isNullValue()) { + if (UseConstantIntForScalableSplat && isa<ConstantInt>(V)) + return ConstantInt::get(V->getContext(), EC, + cast<ConstantInt>(V)->getValue()); + if (UseConstantFPForScalableSplat && isa<ConstantFP>(V)) + return ConstantFP::get(V->getContext(), EC, + cast<ConstantFP>(V)->getValue()); + } + Type *VTy = VectorType::get(V->getType(), EC); if (V->isNullValue()) diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index eb18be5..389bac4 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -64,6 +64,9 @@ void DbgRecord::deleteRecord() { case ValueKind: delete cast<DPValue>(this); return; + case LabelKind: + delete cast<DPLabel>(this); + return; } llvm_unreachable("unsupported DbgRecord kind"); } @@ -73,6 +76,9 @@ void DbgRecord::print(raw_ostream &O, bool IsForDebug) const { case ValueKind: cast<DPValue>(this)->print(O, IsForDebug); return; + case LabelKind: + cast<DPLabel>(this)->print(O, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -83,6 +89,9 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST, case ValueKind: cast<DPValue>(this)->print(O, MST, IsForDebug); return; + case LabelKind: + cast<DPLabel>(this)->print(O, MST, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -93,16 +102,23 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const { switch (RecordKind) { case ValueKind: return cast<DPValue>(this)->isIdenticalToWhenDefined(*cast<DPValue>(&R)); + case LabelKind: + return cast<DPLabel>(this)->getLabel() == cast<DPLabel>(R).getLabel(); }; llvm_unreachable("unsupported DbgRecord kind"); } bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { - if (RecordKind != R.RecordKind) - return false; + return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); +} + +DbgInfoIntrinsic * +DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { switch (RecordKind) { case ValueKind: - return cast<DPValue>(this)->isEquivalentTo(*cast<DPValue>(&R)); + return cast<DPValue>(this)->createDebugIntrinsic(M, InsertBefore); + case LabelKind: + return cast<DPLabel>(this)->createDebugIntrinsic(M, InsertBefore); }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -307,12 +323,16 @@ DbgRecord *DbgRecord::clone() const { switch (RecordKind) { case ValueKind: return cast<DPValue>(this)->clone(); + case LabelKind: + return cast<DPLabel>(this)->clone(); }; llvm_unreachable("unsupported DbgRecord kind"); } DPValue *DPValue::clone() const { return new DPValue(*this); } +DPLabel *DPLabel::clone() const { return new DPLabel(Label, getDebugLoc()); } + DbgVariableIntrinsic * DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { [[maybe_unused]] DICompileUnit *Unit = @@ -368,6 +388,20 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { return DVI; } +DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const { + auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + Value *Args[] = { + MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; + DbgLabelInst *DbgLabel = cast<DbgLabelInst>( + CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args)); + DbgLabel->setTailCall(); + DbgLabel->setDebugLoc(getDebugLoc()); + if (InsertBefore) + DbgLabel->insertBefore(InsertBefore); + return DbgLabel; +} + Value *DPValue::getAddress() const { auto *MD = getRawAddress(); if (auto *V = dyn_cast<ValueAsMetadata>(MD)) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index ce0df53..fc5c9b2 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3525,6 +3525,7 @@ CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, "Invalid cast"); unsigned SrcBits = C->getType()->getScalarSizeInBits(); unsigned DstBits = Ty->getScalarSizeInBits(); + assert((C->getType() == Ty || SrcBits != DstBits) && "Invalid cast"); Instruction::CastOps opcode = (SrcBits == DstBits ? Instruction::BitCast : (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt)); diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 15c90a4..a0bf9ca 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -119,7 +119,9 @@ LLVMContextImpl::~LLVMContextImpl() { IntZeroConstants.clear(); IntOneConstants.clear(); IntConstants.clear(); + IntSplatConstants.clear(); FPConstants.clear(); + FPSplatConstants.clear(); CDSConstants.clear(); // Destroy attribute node lists. diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 6a20291..2ee1080 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1488,8 +1488,12 @@ public: DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntZeroConstants; DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntOneConstants; DenseMap<APInt, std::unique_ptr<ConstantInt>> IntConstants; + DenseMap<std::pair<ElementCount, APInt>, std::unique_ptr<ConstantInt>> + IntSplatConstants; DenseMap<APFloat, std::unique_ptr<ConstantFP>> FPConstants; + DenseMap<std::pair<ElementCount, APFloat>, std::unique_ptr<ConstantFP>> + FPSplatConstants; FoldingSet<AttributeImpl> AttrsSet; FoldingSet<AttributeListImpl> AttrsLists; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 7b3a759..6cfe677 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -330,8 +330,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, report_fatal_error(Twine("unable to parse pass pipeline description '") + Conf.OptPipeline + "': " + toString(std::move(Err))); } - } else if (Conf.UseDefaultPipeline) { - MPM.addPass(PB.buildPerModuleDefaultPipeline(OL)); } else if (IsThinLTO) { MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary)); } else { diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp index cb20fef..635cd83 100644 --- a/llvm/lib/Object/SymbolSize.cpp +++ b/llvm/lib/Object/SymbolSize.cpp @@ -65,6 +65,13 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) { return Ret; } + if (const auto *E = dyn_cast<WasmObjectFile>(&O)) { + for (SymbolRef Sym : E->symbols()) { + Ret.push_back({Sym, E->getSymbolSize(Sym)}); + } + return Ret; + } + // Collect sorted symbol addresses. Include dummy addresses for the end // of each section. std::vector<SymEntry> Addresses; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f26d95a..fed7a14 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -91,6 +91,7 @@ #include "llvm/CodeGen/JMCInstrumenter.h" #include "llvm/CodeGen/LowerEmuTLS.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/SafeStack.h" #include "llvm/CodeGen/SelectOptimize.h" #include "llvm/CodeGen/ShadowStackGCLowering.h" @@ -1260,6 +1261,28 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) { } template <typename CallbacksT> +static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) { + // Explicitly handle pass manager names. + if (Name == "machine-function") + return true; + + // Explicitly handle custom-parsed pass names. + if (parseRepeatPassName(Name)) + return true; + +#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) \ + return true; +#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ + if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \ + return true; + +#include "llvm/Passes/MachinePassRegistry.def" + + return callbacksAcceptPassName<MachineFunctionPassManager>(Name, Callbacks); +} + +template <typename CallbacksT> static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks, bool &UseMemorySSA) { UseMemorySSA = false; @@ -1394,6 +1417,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); return Error::success(); } + if (Name == "machine-function") { + MachineFunctionPassManager MFPM; + if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline)) + return Err; + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + return Error::success(); + } if (auto Params = parseFunctionPipelineName(Name)) { if (Params->second) return make_error<StringError>( @@ -1874,8 +1904,8 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM, } #include "llvm/Passes/MachinePassRegistry.def" - for (auto &C : MachinePipelineParsingCallbacks) - if (C(Name, MFPM)) + for (auto &C : MachineFunctionPipelineParsingCallbacks) + if (C(Name, MFPM, E.InnerPipeline)) return Error::success(); return make_error<StringError>( formatv("unknown machine pass '{0}'", Name).str(), @@ -1942,7 +1972,8 @@ Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM) { + ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM) { MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); }); @@ -1950,6 +1981,14 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + if (MFAM) { + MAM.registerPass( + [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); }); + MFAM->registerPass( + [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); }); + MFAM->registerPass( + [&] { return FunctionAnalysisManagerMachineFunctionProxy(FAM); }); + } } Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, @@ -1991,6 +2030,9 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, UseMemorySSA)) { Pipeline = {{"function", {{UseMemorySSA ? "loop-mssa" : "loop", std::move(*Pipeline)}}}}; + } else if (isMachineFunctionPassName( + FirstName, MachineFunctionPipelineParsingCallbacks)) { + Pipeline = {{"machine-function", std::move(*Pipeline)}}; } else { for (auto &C : TopLevelPipelineParsingCallbacks) if (C(MPM, *Pipeline)) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 142bd50..17b55b6 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -247,7 +247,7 @@ static cl::opt<bool> static cl::opt<bool> EnableJumpTableToSwitch( "enable-jump-table-to-switch", - cl::desc("Enable JumpTableToSwitch pass (default = off)")); + cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true)); // This option is used in simplifying testing SampleFDO optimizations for // profile loading. diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index c62582a..a99856d 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // name (emitting the definition) can grab it from the metadata. // // FIXME: Handle functions with weak linkage? - if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) { + if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional<std::string> MangledName = getArm64ECMangledFunctionName(F.getName().str())) { F.setMetadata("arm64ec_unmangled_name", diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 5b5ffd7..4fa719a 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { TS->emitDirectiveVariantPCS(CurrentFnSym); } - if (TM.getTargetTriple().isWindowsArm64EC()) { + if (TM.getTargetTriple().isWindowsArm64EC() && + !MF->getFunction().hasLocalLinkage()) { // For ARM64EC targets, a function definition's name is mangled differently // from the normal symbol. We emit the alias from the unmangled symbol to // mangled symbol name here. diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3485edb..5cc612e 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -239,11 +239,6 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone", cl::desc("enable use of redzone on AArch64"), cl::init(false), cl::Hidden); -static cl::opt<bool> - ReverseCSRRestoreSeq("reverse-csr-restore-seq", - cl::desc("reverse the CSR restore sequence"), - cl::init(false), cl::Hidden); - static cl::opt<bool> StackTaggingMergeSetTag( "stack-tagging-merge-settag", cl::desc("merge settag instruction in function epilog"), cl::init(true), @@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; if (!EnableHomogeneousPrologEpilog) return false; - if (ReverseCSRRestoreSeq) - return false; if (EnableRedZone) return false; @@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { + if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) + .setMIFlag(MachineInstr::FrameDestroy); + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Define); + MIB.addReg(RPI.Reg2, RegState::Define); + } + return true; + } + + // For performance reasons restore SVE register in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + + for (const RegPairInfo &RPI : RegPairs) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -3191,42 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); - - return MIB->getIterator(); - }; - - // SVE objects are always restored in reverse order. - for (const RegPairInfo &RPI : reverse(RegPairs)) - if (RPI.isScalable()) - EmitMI(RPI); - - if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) - .setMIFlag(MachineInstr::FrameDestroy); - for (auto &RPI : RegPairs) { - MIB.addReg(RPI.Reg1, RegState::Define); - MIB.addReg(RPI.Reg2, RegState::Define); - } - return true; - } - - if (ReverseCSRRestoreSeq) { - MachineBasicBlock::iterator First = MBB.end(); - for (const RegPairInfo &RPI : reverse(RegPairs)) { - if (RPI.isScalable()) - continue; - MachineBasicBlock::iterator It = EmitMI(RPI); - if (First == MBB.end()) - First = It; - } - if (First != MBB.end()) - MBB.splice(MBBI, &MBB, First); - } else { - for (const RegPairInfo &RPI : RegPairs) { - if (RPI.isScalable()) - continue; - (void)EmitMI(RPI); - } } return true; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 184ebc1..3b92e95 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); @@ -947,9 +949,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::f16, Custom); - setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + if (Subtarget->hasFPARMv8()) { + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + } // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 436b21f..bec1348 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1308,6 +1308,8 @@ private: bool preferScalarizeSplat(SDNode *N) const override; unsigned getMinimumJumpTableEntries() const override; + + bool softPromoteHalfType() const override { return true; } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 0ae9a69..1c577a2 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; -def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 961dded..ef7c517 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackSafetyAnalysis.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto &I : SInfo.AllocasToInstrument) { memtag::AllocaInfo &Info = I.second; assert(Info.AI && SIB.isInterestingAlloca(*Info.AI)); - TrackingVH<Instruction> OldAI = Info.AI; memtag::alignAndPadAlloca(Info, kTagGranuleSize); AllocaInst *AI = Info.AI; int Tag = NextTag; @@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ConstantInt::get(IRB.getInt64Ty(), Tag)}); if (Info.AI->hasName()) TagPCall->setName(Info.AI->getName() + ".tag"); - Info.AI->replaceAllUsesWith(TagPCall); + // Does not replace metadata, so we don't have to handle DPValues. + Info.AI->replaceNonMetadataUsesWith(TagPCall); TagPCall->setOperand(0, Info.AI); // Calls to functions that may return twice (e.g. setjmp) confuse the @@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto *II : Info.LifetimeEnd) II->eraseFromParent(); } - - // Fixup debug intrinsics to point to the new alloca. - for (auto *DVI : Info.DbgVariableIntrinsics) - DVI->replaceVariableLocationOp(OldAI, Info.AI); - for (auto *DPV : Info.DbgVariableRecords) - DPV->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6655931..010e569 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); + case ISD::FREM: + // Pass nullptr as fmod/fmodf calls are emitted by the backend even when + // those functions are not declared in the module. + if (!Ty->isVectorTy()) + return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b9411e2..9218760 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule< [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; +def fdiv_by_sqrt_to_rsq_f16 : GICombineRule< + (defs root:$root), + (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)), + (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root, + [{ return matchFDivSqrtToRsqF16(*${root}); }]), + (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>; def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">; @@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> { + rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 0d3b158..13d7510 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, return true; } -static const unsigned SPDenormModeBitField = - AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); +static constexpr unsigned SPDenormModeBitField = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2); // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions // to enable denorm mode. When 'Enable' is false, disable denorm mode. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index a1c34e9..82e17dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -83,6 +83,9 @@ public: matchRcpSqrtToRsq(MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) const; + bool matchFDivSqrtToRsqF16(MachineInstr &MI) const; + void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const; + // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. struct CvtF32UByteMatchInfo { @@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( return false; } +bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16( + MachineInstr &MI) const { + Register Sqrt = MI.getOperand(2).getReg(); + return MRI.hasOneNonDBGUse(Sqrt); +} + +void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16( + MachineInstr &MI, const Register &X) const { + Register Dst = MI.getOperand(0).getReg(); + Register Y = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + uint32_t Flags = MI.getFlags(); + Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy}) + .addUse(X) + .setMIFlags(Flags) + .getReg(0); + B.buildFMul(Dst, RSQ, Y, Flags); + MI.eraseFromParent(); +} + bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { Register SrcReg = MI.getOperand(1).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5b32b34..b7b471d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { if (trySkipId("hwreg", AsmToken::LParen)) { OperandInfoTy HwReg(OPR_ID_UNKNOWN); - OperandInfoTy Offset(OFFSET_DEFAULT_); - OperandInfoTy Width(WIDTH_DEFAULT_); + OperandInfoTy Offset(HwregOffset::Default); + OperandInfoTy Width(HwregSize::Default); if (parseHwregBody(HwReg, Offset, Width) && validateHwreg(HwReg, Offset, Width)) { - ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id); + ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id); } else { return ParseStatus::Failure; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 894607d..e1cca17 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, return addOperand(Inst, DAsm->decodeSplitBarrier(Val)); } +static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr, + const MCDisassembler *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->decodeDpp8FI(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ uint64_t /*Addr*/, \ @@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) { return DecoderUInt128(Lo, Hi); } -// The disassembler is greedy, so we need to check FI operand value to -// not parse a dpp if the correct literal is not set. For dpp16 the -// autogenerated decoder checks the dpp literal -static bool isValidDPP8(const MCInst &MI) { - using namespace llvm::AMDGPU::DPP; - int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); - assert(FiIdx != -1); - if ((unsigned)FiIdx >= MI.getNumOperands()) - return false; - unsigned Fi = MI.getOperand(FiIdx).getImm(); - return Fi == DPP8_FI_0 || Fi == DPP8_FI_1; -} - DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes_, uint64_t Address, @@ -460,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); - DecodeStatus Res = MCDisassembler::Fail; + // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless + // there are fewer bytes left). This will be overridden on success. + Size = std::min((size_t)4, Bytes_.size()); + do { // ToDo: better to switch encoding length using some bit predicate // but it is unknown yet, so try all we can @@ -469,222 +465,147 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = - tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, - MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) - break; - MI = MCInst(); // clear - Res = - tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) - break; - MI = MCInst(); // clear - - const auto convertVOPDPP = [&]() { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) { - convertVOPCDPPInst(MI); // Special VOP3 case - } else { - assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); - convertVOP3DPPInst(MI); // Regular VOP3 case - } - }; - Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, - MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); - break; - } - Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); - break; - } - Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); - if (Res) + + if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, + DecW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, + DecW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; } + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); if (Bytes.size() >= 8) { const uint64_t QW = eatBytes<uint64_t>(Bytes); - if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { - Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS); - if (Res) { - if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) - == -1) - break; - if (convertDPP8Inst(MI) == MCDisassembler::Success) - break; - MI = MCInst(); // clear - } - } - - Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) - break; - MI = MCInst(); // clear - - Res = tryDecodeInst(DecoderTableDPP8GFX1164, - DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) - break; - MI = MCInst(); // clear - - Res = tryDecodeInst(DecoderTableDPP8GFX1264, - DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) && + tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS)) break; - MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); - if (Res) break; - - Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, - MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); - break; - } - - Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, - MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); + if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) && + tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS)) break; - } - - if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { - Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); - if (Res) - break; - } // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and // v_mad_mixhi_f16 for FMA variants. Try to decode using this special // table first so we print the correct name. - if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) { - Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) && + tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) { - Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) && + tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { - Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) && + tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, - QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, + Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, - QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, + Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) break; } - // Reinitialize Bytes as DPP64 could have eaten too much + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); // Try decode 32-bit instruction - if (Bytes.size() < 4) break; - const uint32_t DW = eatBytes<uint32_t>(Bytes); - Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS); - if (Res) break; + if (Bytes.size() >= 4) { + const uint32_t DW = eatBytes<uint32_t>(Bytes); - Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { - Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS)) break; - } - if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { - Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS); - if (Res) break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) && + tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS)) + break; + + if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) && + tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, - Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, + Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, - Address, CS); + if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, + Address, CS)) + break; + } + + return MCDisassembler::Fail; } while (false); - if (Res && AMDGPU::isMAC(MI.getOpcode())) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) { + if (isMacDPP(MI)) + convertMacDPPInst(MI); + + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); + else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) || + AMDGPU::isVOPC64DPP(MI.getOpcode())) + convertVOPCDPPInst(MI); // Special VOP3 case + else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) != + -1) + convertDPP8Inst(MI); + else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) + convertVOP3DPPInst(MI); // Regular VOP3 case + } + + if (AMDGPU::isMAC(MI.getOpcode())) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); } - if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || - MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && + if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) { + if (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) { int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cpol); if (CPolPos != -1) { @@ -700,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && - (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) { + if ((MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && + (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) { // GFX90A lost TFE, its place is occupied by ACC. int TFEOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); @@ -713,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) { + if (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) { int SWZOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); if (SWZOpIdx != -1) { @@ -724,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int RsrcIdx = @@ -732,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1; if (VAddr0Idx >= 0 && NSAArgs > 0) { unsigned NSAWords = (NSAArgs + 3) / 4; - if (Bytes.size() < 4 * NSAWords) { - Res = MCDisassembler::Fail; - } else { - for (unsigned i = 0; i < NSAArgs; ++i) { - const unsigned VAddrIdx = VAddr0Idx + 1 + i; - auto VAddrRCID = - MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; - MI.insert(MI.begin() + VAddrIdx, - createRegOperand(VAddrRCID, Bytes[i])); - } - Bytes = Bytes.slice(4 * NSAWords); + if (Bytes.size() < 4 * NSAWords) + return MCDisassembler::Fail; + for (unsigned i = 0; i < NSAArgs; ++i) { + const unsigned VAddrIdx = VAddr0Idx + 1 + i; + auto VAddrRCID = + MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; + MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i])); } + Bytes = Bytes.slice(4 * NSAWords); } - if (Res) - Res = convertMIMGInst(MI); + convertMIMGInst(MI); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))) - Res = convertMIMGInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)) + convertMIMGInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)) - Res = convertEXPInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP) + convertEXPInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)) - Res = convertVINTERPInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP) + convertVINTERPInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)) - Res = convertSDWAInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA) + convertSDWAInst(MI); int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); @@ -782,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, int ImmLitIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm); bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK; - if (Res && ImmLitIdx != -1 && !IsSOPK) - Res = convertFMAanyK(MI, ImmLitIdx); + if (ImmLitIdx != -1 && !IsSOPK) + convertFMAanyK(MI, ImmLitIdx); - // if the opcode was not recognized we'll assume a Size of 4 bytes - // (unless there are fewer bytes left) - Size = Res ? (MaxInstBytesNum - Bytes.size()) - : std::min((size_t)4, Bytes_.size()); - return Res; + Size = MaxInstBytesNum - Bytes.size(); + return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) { // The MCInst still has these fields even though they are no longer encoded // in the GFX11 instruction. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr); } - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || @@ -815,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { // instruction. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); } - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { +void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { if (STI.hasFeature(AMDGPU::FeatureGFX9) || STI.hasFeature(AMDGPU::FeatureGFX10)) { if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst)) @@ -835,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); } } - return MCDisassembler::Success; } struct VOPModifiers { @@ -939,56 +850,40 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { AMDGPU::OpName::src2_modifiers); } -// We must check FI == literal to reject not genuine dpp8 insts, and we must -// first add optional MI operands to check FI -DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { +void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); - if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || - AMDGPU::isVOPC64DPP(Opc)) { - convertVOPCDPPInst(MI); - } else { - if (isMacDPP(MI)) - convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); - int VDstInIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); - if (VDstInIdx != -1) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); - if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || - MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + convertTrue16OpSel(MI); + auto Mods = collectVOPModifiers(MI); + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), + AMDGPU::OpName::op_sel); + } else { + // Insert dummy unused src modifiers. + if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); - unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { - convertTrue16OpSel(MI); - auto Mods = collectVOPModifiers(MI); - insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), - AMDGPU::OpName::op_sel); - } else { - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - } + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); } - return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; } -DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { - if (isMacDPP(MI)) - convertMacDPPInst(MI); - +void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { convertTrue16OpSel(MI); int VDstInIdx = @@ -1008,13 +903,12 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), AMDGPU::OpName::op_sel); } - return MCDisassembler::Success; } // Note that before gfx10, the MIMG encoding provided no information about // VADDR size. Consequently, decoded instructions always show address as if it // has 1 dword, which could be not really so. -DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { +void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { auto TSFlags = MCII->get(MI.getOpcode()).TSFlags; int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), @@ -1043,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (BaseOpcode->BVH) { // Add A16 operand for intersect_ray instructions addOperand(MI, MCOperand::createImm(BaseOpcode->A16)); - return MCDisassembler::Success; + return; } bool IsAtomic = (VDstIdx != -1); @@ -1078,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) { // The NSA encoding does not contain enough operands for the // combination of base opcode / dimension. Should this be an error? - return MCDisassembler::Success; + return; } IsPartialNSA = true; } @@ -1097,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { DstSize += 1; if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) - return MCDisassembler::Success; + return; int NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize); if (NewOpcode == -1) - return MCDisassembler::Success; + return; // Widen the register to the correct number of enabled channels. unsigned NewVdata = AMDGPU::NoRegister; @@ -1119,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (NewVdata == AMDGPU::NoRegister) { // It's possible to encode this such that the low register + enabled // components exceeds the register count. - return MCDisassembler::Success; + return; } } @@ -1137,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &MRI.getRegClass(AddrRCID)); if (!NewVAddrSA) - return MCDisassembler::Success; + return; } MI.setOpcode(NewOpcode); @@ -1158,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MI.erase(MI.begin() + VAddr0Idx + AddrSize, MI.begin() + VAddr0Idx + Info->VAddrDwords); } - - return MCDisassembler::Success; } // Opsel and neg bits are used in src_modifiers and standalone operands. Autogen // decoder only adds to src_modifiers, so manually add the bits to the other // operands. -DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); auto Mods = collectVOPModifiers(MI, true); @@ -1190,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi)) insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi), AMDGPU::OpName::neg_hi); - - return MCDisassembler::Success; } // Create dummy old operand and insert optional operands -DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); @@ -1212,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src1_modifiers); - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, - int ImmLitIdx) const { +void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const { assert(HasLiteral && "Should have decoded a literal"); const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); unsigned DescNumOps = Desc.getNumOperands(); @@ -1232,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, IsDeferredOp) Op.setImm(Literal); } - return MCDisassembler::Success; } const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { @@ -1831,6 +1718,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const { return decodeSrcOp(OPW32, Val); } +MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { + if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1) + return MCOperand(); + return MCOperand::createImm(Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3142b8a..2e1b6fb 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -194,15 +194,15 @@ public: DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer, raw_string_ostream &KdStream) const; - DecodeStatus convertEXPInst(MCInst &MI) const; - DecodeStatus convertVINTERPInst(MCInst &MI) const; - DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const; - DecodeStatus convertSDWAInst(MCInst &MI) const; - DecodeStatus convertDPP8Inst(MCInst &MI) const; - DecodeStatus convertMIMGInst(MCInst &MI) const; - DecodeStatus convertVOP3DPPInst(MCInst &MI) const; - DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; - DecodeStatus convertVOPCDPPInst(MCInst &MI) const; + void convertEXPInst(MCInst &MI) const; + void convertVINTERPInst(MCInst &MI) const; + void convertFMAanyK(MCInst &MI, int ImmLitIdx) const; + void convertSDWAInst(MCInst &MI) const; + void convertDPP8Inst(MCInst &MI) const; + void convertMIMGInst(MCInst &MI) const; + void convertVOP3DPPInst(MCInst &MI) const; + void convertVOP3PDPPInst(MCInst &MI) const; + void convertVOPCDPPInst(MCInst &MI) const; void convertMacDPPInst(MCInst &MI) const; void convertTrue16OpSel(MCInst &MI) const; @@ -261,6 +261,7 @@ public: MCOperand decodeBoolReg(unsigned Val) const; MCOperand decodeSplitBarrier(unsigned Val) const; + MCOperand decodeDpp8FI(unsigned Val) const; int getTTmpIdx(unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a727134..00fa93c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) { static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); - return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; + return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); } ScheduleHazardRecognizer::HazardType diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a45fea6..a32be1e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Id; - unsigned Offset; - unsigned Width; - using namespace llvm::AMDGPU::Hwreg; unsigned Val = MI->getOperand(OpNo).getImm(); - decodeHwreg(Val, Id, Offset, Width); + auto [Id, Offset, Width] = HwregEncoding::decode(Val); StringRef HwRegName = getHwreg(Id, STI); O << "hwreg("; @@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, } else { O << Id; } - if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) { + if (Width != HwregSize::Default || Offset != HwregOffset::Default) O << ", " << Offset << ", " << Width; - } O << ')'; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 98310c3..0b516bf 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0] ID_SQ_PERF_SNAPSHOT_DATA1 = 22, ID_SQ_PERF_SNAPSHOT_PC_LO = 23, ID_SQ_PERF_SNAPSHOT_PC_HI = 24, - - ID_SHIFT_ = 0, - ID_WIDTH_ = 6, - ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) }; enum Offset : unsigned { // Offset, (5) [10:6] - OFFSET_DEFAULT_ = 0, - OFFSET_SHIFT_ = 6, - OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), - OFFSET_MEM_VIOL = 8, }; -enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11] - WIDTH_M1_DEFAULT_ = 31, - WIDTH_M1_SHIFT_ = 11, - WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), -}; - -// Some values from WidthMinusOne mapped into Width domain. -enum Width : unsigned { - WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1, -}; - enum ModeRegisterMasks : uint32_t { FP_ROUND_MASK = 0xf << 0, // Bits 0..3 FP_DENORM_MASK = 0xf << 4, // Bits 4..7 diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d02aee7..4f106bf 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). - addReg(FlatScrInitLo). - addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | - (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). - addReg(FlatScrInitHi). - addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | - (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + using namespace AMDGPU::Hwreg; + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) + .addReg(FlatScrInitLo) + .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32))); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) + .addReg(FlatScrInitHi) + .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32))); return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 257dff6..d8f528d8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, assert(Op.getValueType() == MVT::i32); uint32_t BothRoundHwReg = - AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); SDValue IntrinID = @@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock::iterator I = LoopBB->end(); - const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( - AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); + const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); // Clear TRAP_STS.MEM_VIOL BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) @@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // Otherwise there was overflow and the result is hi2:0. In both cases the // result should represent the actual time at some point during the sequence // of three getregs. + using namespace AMDGPU::Hwreg; Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) - .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, - 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) - .addImm( - AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) - .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, - 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) .addReg(RegHi1) .addReg(RegHi2); @@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // FIXME: This could be predicates on the immediate, but tablegen doesn't // allow you to have a no side effect instruction in the output of a // sideeffecting pattern. - unsigned ID, Offset, Width; - AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); + auto [ID, Offset, Width] = + AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); if (ID != AMDGPU::Hwreg::ID_MODE) return BB; @@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags); - const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | - (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + using namespace AMDGPU::Hwreg; + const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2); const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); const MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6ecb1c8..a6184c5 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -480,6 +480,10 @@ public: // WaitEventType to corresponding counter values in InstCounterType. virtual const unsigned *getWaitEventMask() const = 0; + // Returns a new waitcnt with all counters except VScnt set to 0. If + // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; + virtual ~WaitcntGenerator() = default; }; @@ -516,6 +520,8 @@ public: return WaitEventMaskForInstPreGFX12; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { @@ -549,6 +555,8 @@ public: return WaitEventMaskForInstGFX12Plus; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( return Modified; } +AMDGPU::Waitcnt +WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); +} + +AMDGPU::Waitcnt +WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); +} + /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that /// were added by previous passes. Currently this pass conservatively @@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly @@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); + Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); if (ForceEmitWaitcnt[LOAD_CNT]) Wait.LoadCnt = 0; @@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything ScoreBrackets->applyWaitcnt( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else { // May need to way wait for anything. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 97c7237..34cdb09 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC { } class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1, - string ConvertMethod = "nullptr"> - : CustomOperand<Type, Optional, NAME> { + string name = NAME, string ConvertMethod = "nullptr"> + : CustomOperand<Type, Optional, name> { let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# "return parseIntWithPrefix(\""#Prefix#"\", Operands, "# @@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in { def DppRowMask : NamedIntOperand<i32, "row_mask">; def DppBankMask : NamedIntOperand<i32, "bank_mask">; } -def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, +def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl", "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">; -def DppFI : NamedIntOperand<i32, "fi">; + +let DecoderMethod = "decodeDpp8FI" in +def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">; +def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">; def blgp : CustomOperand<i32, 1, "BLGP">; def CBSZ : NamedIntOperand<i32, "cbsz">; @@ -1823,7 +1826,7 @@ class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperan Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC, @@ -1831,7 +1834,7 @@ class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> { @@ -1851,12 +1854,12 @@ class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit Has class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> { dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> { dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } // Ins for SDWA diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index e62ad02..c01b126 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask); unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); + using namespace AMDGPU::Hwreg; BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) - .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | - (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); + .addImm(HwregEncoding::encode(ID_MODE, Offset, Width)); ++NumSetregInserted; Changed = true; InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); @@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, // as we assume it has been inserted by a higher authority (this is // likely to be a very rare occurrence). unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); - if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != - AMDGPU::Hwreg::ID_MODE) + using namespace AMDGPU::Hwreg; + auto [Id, Offset, Width] = HwregEncoding::decode(Dst); + if (Id != ID_MODE) continue; - unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> - AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + - 1; - unsigned Offset = - (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset; // If an InsertionPoint is set we will insert a setreg there. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index dacdf7b..ce91e05 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1698,22 +1698,14 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) { return (Idx < 0) ? Idx : Opr[Idx].Encoding; } -bool isValidHwreg(int64_t Id) { - return 0 <= Id && isUInt<ID_WIDTH_>(Id); -} +bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); } bool isValidHwregOffset(int64_t Offset) { - return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset); + return 0 <= Offset && isUInt<HwregOffset::Width>(Offset); } bool isValidHwregWidth(int64_t Width) { - return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1); -} - -uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { - return (Id << ID_SHIFT_) | - (Offset << OFFSET_SHIFT_) | - ((Width - 1) << WIDTH_M1_SHIFT_); + return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1); } StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { @@ -1721,12 +1713,6 @@ StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { return (Idx < 0) ? "" : Opr[Idx].Name; } -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { - Id = (Val & ID_MASK_) >> ID_SHIFT_; - Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_; - Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; -} - } // namespace Hwreg //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f35e7744..6826cd2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs, } // end namespace IsaInfo +// Represents a field in an encoded value. +template <unsigned HighBit, unsigned LowBit, unsigned D = 0> +struct EncodingField { + static_assert(HighBit >= LowBit, "Invalid bit range!"); + static constexpr unsigned Offset = LowBit; + static constexpr unsigned Width = HighBit - LowBit + 1; + + using ValueType = unsigned; + static constexpr ValueType Default = D; + + ValueType Value; + constexpr EncodingField(ValueType Value) : Value(Value) {} + + constexpr uint64_t encode() const { return Value; } + static ValueType decode(uint64_t Encoded) { return Encoded; } +}; + +// A helper for encoding and decoding multiple fields. +template <typename... Fields> struct EncodingFields { + static constexpr uint64_t encode(Fields... Values) { + return ((Values.encode() << Values.Offset) | ...); + } + + static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) { + return {Fields::decode((Encoded >> Fields::Offset) & + maxUIntN(Fields::Width))...}; + } +}; + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); @@ -870,15 +899,6 @@ struct Waitcnt { : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} - static Waitcnt allZero(bool Extended, bool HasStorecnt) { - return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) - : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); - } - - static Waitcnt allZeroExceptVsCnt(bool Extended) { - return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); - } - bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { @@ -1030,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); namespace Hwreg { +using HwregId = EncodingField<5, 0>; +using HwregOffset = EncodingField<10, 6>; + +struct HwregSize : EncodingField<15, 11, 32> { + using EncodingField::EncodingField; + constexpr uint64_t encode() const { return Value - 1; } + static ValueType decode(uint64_t Encoded) { return Encoded + 1; } +}; + +using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>; + LLVM_READONLY int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); @@ -1043,13 +1074,8 @@ LLVM_READNONE bool isValidHwregWidth(int64_t Width); LLVM_READNONE -uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width); - -LLVM_READNONE StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); - } // namespace Hwreg namespace DepCtr { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 99f8e8e..f5424cf 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -380,9 +380,9 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un let OutsDPP = (outs Src0RC32:$vdst); let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; - let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi); + let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi); let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret; let OutsVOP3DPP = (outs Src0RC64:$vdst); @@ -749,7 +749,7 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP16 <op, ps, Gen.Subtarget, p> { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -770,7 +770,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } //===----------------------------------------------------------------------===// @@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16, - DecoderNamespace = "DPP" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp<Gen, op, opName>; } @@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8, - DecoderNamespace = "DPP8" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp8<Gen, op, opName>; } @@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP1_Real_dpp8_gfx10<bits<9> op> { if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1192,16 +1190,14 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } -multiclass VOP1Only_Real_vi <bits<10> op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { +let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1Only_Real_vi <bits<10> op> { def _vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; } -} -multiclass VOP1_Real_e32e64_vi <bits<10> op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1_Real_e32e64_vi <bits<10> op> { def _e32_vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; @@ -1389,44 +1385,41 @@ def : GCNPat < // GFX9 //===----------------------------------------------------------------------===// -multiclass VOP1_Real_gfx9 <bits<10> op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { + multiclass VOP1_Real_gfx9 <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; - } - - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; - -} -multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { - defm NAME : VOP1_Real_e32e64_vi <op>; + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; } - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { - let Inst{42-40} = 6; - } + multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> { + defm NAME : VOP1_Real_e32e64_vi <op>; - if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + let Inst{42-40} = 6; + } + + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; + } } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; -let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in +let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; let OtherPredicates = [HasFP8ConversionInsts] in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 4437d5f..13fe79b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -430,7 +430,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v getVregSrcForVT<Src2VT>.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, RegisterOperand<VGPR_32>, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; @@ -447,7 +447,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT<Src2VT>.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, getVregSrcForVT<Src2VT>.ret:$src2, // stub argument @@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> { let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument } @@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/ Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let OutsVOP3DPP = Outs64; @@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1> Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let HasExt = 1; let HasExtDPP = 1; @@ -645,11 +645,11 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> { FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, FPVRegInputMods:$src0_modifiers, Src0DPP:$src0, FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src0ModVOP3DPP = FPVRegInputMods; let Src1ModVOP3DPP = FPVRegInputMods; @@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen, VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen, VOP2_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } //===------------------------- VOP2 (with name) -------------------------===// @@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"; } } @@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); - let DecoderNamespace = "DPP8"; } if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w32_gfx10 : @@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> { VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } -let AssemblerPredicate = isGFX8Only in { +let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in { multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> { def _e32_vi : @@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } def _e64_vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : @@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; } } -} -let AssemblerPredicate = isGFX9Only in { +} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" + +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { def _e32_gfx9 : @@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } def _e64_gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : @@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } } multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { def _e32_gfx9 : VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{ - let DecoderNamespace = "GFX9"; - } + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; def _e64_gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>, - VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - let DecoderNamespace = "GFX9"; - } + VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, @@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { - let DecoderNamespace = "GFX9"; - } + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } -} // AssemblerPredicate = isGFX9Only +} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi <bits<6> op> : Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 396ae9c..7198a40 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, - VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasExtVOP3DPP = 1; @@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, - op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 74f451b..ac3c8f9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>, let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, - neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi); + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi); let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, - DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); } multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> { @@ -1353,6 +1353,7 @@ class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget, let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> let SchedRW = ps.SchedRW; let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } //===----------------------------------------------------------------------===// @@ -1486,7 +1488,7 @@ multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"), Gen.Subtarget> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1496,7 +1498,7 @@ multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1613,7 +1615,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"), VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> { - let SubtargetPredicate = isGFX940Plus, + let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>, diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index fe52a0e..e5e8244 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -222,6 +222,8 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, let AsmVariantName = AMDGPUAsmVariants.Default; let SubtargetPredicate = AssemblerPredicate; + + string DecoderNamespace; // dummy } multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name, string mnemonic_from = real_name> { @@ -766,7 +768,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP16 = AsmDPP#"$fi"; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); @@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> //===----------------------------------------------------------------------===// multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>, - VOPCe<op{7-0}>; - def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>, + VOPCe<op{7-0}>; + def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>; - def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>; - def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; - def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; - def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - // 32 and 64 bit forms of the instruction have _e32 and _e64 - // respectively appended to their assembly mnemonic. - // _e64 is printed as part of the VOPDstS64orS32 operand, whereas - // the destination-less 32bit forms add it to the asmString here. - VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">, - VOPCe<op{7-0}>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic, - pseudo_mnemonic), - asm_name, ps32.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>; - def _e64#Gen.Suffix : - VOP3_Real<ps64, Gen.Subtarget, asm_name>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic, - pseudo_mnemonic), - asm_name, ps64.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. + // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. + VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">, + VOPCe<op{7-0}>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic, + pseudo_mnemonic), + asm_name, ps32.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>; + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget, asm_name>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic, + pseudo_mnemonic), + asm_name, ps64.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, - Gen.Subtarget, asm_name>; - def _e32_dpp_w32#Gen.Suffix - : VOPC_DPP16<op{7-0}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix - : VOPC_DPP16<op{7-0}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; + def _e32_dpp_w32#Gen.Suffix + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; - def _e32_dpp8_w32#Gen.Suffix - : VOPC_DPP8<op{7-0}, ps32, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix - : VOPC_DPP8<op{7-0}, ps32, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; + def _e32_dpp8_w32#Gen.Suffix + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; - def _e64_dpp_w32#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; - def _e64_dpp8_w32#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name, @@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name, VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>; multiclass VOPCX_Real<GFXGen Gen, bits<9> op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - VOPC_Real<ps32, Gen.Subtarget>, - VOPCe<op{7-0}> { - let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) - # " " # ps32.AsmOperands; - } - def _e64#Gen.Suffix : - VOP3_Real<ps64, Gen.Subtarget>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", ps64.Mnemonic) - # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + VOPC_Real<ps32, Gen.Subtarget>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix - : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> { - let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; - } + def _e32_dpp#Gen.Suffix + : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { - let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { - let AsmString = !subst("_nosdst", "", psDPP.OpName) - # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { - let AsmString = !subst("_nosdst", "", ps64.OpName) - # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix - : VOPC_Real<ps32, Gen.Subtarget, asm_name>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic), - pseudo_mnemonic), - asm_name, ps32.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>, - VOPCe<op{7-0}> { - let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; - } - def _e64#Gen.Suffix - : VOP3_Real<ps64, Gen.Subtarget, asm_name>, - MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic), - pseudo_mnemonic), - asm_name, ps64.AsmVariantName>, - Requires<[Gen.AssemblerPredicate]>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ? ; // sdst - let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix + : VOPC_Real<ps32, Gen.Subtarget, asm_name>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic), + pseudo_mnemonic), + asm_name, ps32.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>, + VOPCe<op{7-0}> { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64#Gen.Suffix + : VOP3_Real<ps64, Gen.Subtarget, asm_name>, + MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic), + pseudo_mnemonic), + asm_name, ps64.AsmVariantName>, + Requires<[Gen.AssemblerPredicate]>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? ; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp"); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, - Gen.Subtarget, asm_name>; - } - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name, @@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Only in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOPC_Real_gfx10<bits<9> op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, - VOPCe<op{7-0}>; - def _e64_gfx10 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}>; + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in { } multiclass VOPCX_Real_gfx10<bits<9> op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, - VOPCe<op{7-0}> { - let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr) - # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands; - } - - def _e64_gfx10 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) - # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; + } if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in { defm : VOPCXInstAliases<NAME, "gfx10">; } -} // End AssemblerPredicate = isGFX10Only +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; // GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX6GFX7 in { +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass VOPC_Real_gfx6_gfx7<bits<9> op> { - let DecoderNamespace = "GFX6GFX7" in { - def _e32_gfx6_gfx7 : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe<op{7-0}>; - def _e64_gfx6_gfx7 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX6GFX7" + def _e32_gfx6_gfx7 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOPCe<op{7-0}>; + def _e64_gfx6_gfx7 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases<NAME, "gfx6_gfx7">; } -} // End AssemblerPredicate = isGFX6GFX7 +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> : VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 801afab..80d7d96 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -818,6 +818,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let VALU = 1; let DPP = 1; let Size = 8; + let IsPacked = P.IsPacked; let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); @@ -835,7 +836,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; VOPProfile Pfl = P; } @@ -906,7 +907,7 @@ class VOP_DPP_Base <string OpName, VOPProfile P, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; } class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, @@ -1350,7 +1351,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen, VOP3_DPP16 <op, ps, Gen.Subtarget, opName> { let AssemblerPredicate = Gen.AssemblerPredicate; let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1463,7 +1464,7 @@ multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName, multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1473,7 +1474,7 @@ multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { let Inst{11} = ?; let Inst{12} = ?; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1482,7 +1483,7 @@ multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName, string asmName> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, - DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"), True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate) in { @@ -1505,7 +1506,7 @@ multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName, defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp"); def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>, SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> { - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1514,7 +1515,7 @@ multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName, string asmName> { defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index c5199aab..00a29f8 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -25,42 +25,6 @@ using namespace llvm; using namespace LegalizeActions; -/// FIXME: The following static functions are SizeChangeStrategy functions -/// that are meant to temporarily mimic the behaviour of the old legalization -/// based on doubling/halving non-legal types as closely as possible. This is -/// not entirly possible as only legalizing the types that are exactly a power -/// of 2 times the size of the legal types would require specifying all those -/// sizes explicitly. -/// In practice, not specifying those isn't a problem, and the below functions -/// should disappear quickly as we add support for legalizing non-power-of-2 -/// sized types further. -static void addAndInterleaveWithUnsupported( - LegacyLegalizerInfo::SizeAndActionsVec &result, - const LegacyLegalizerInfo::SizeAndActionsVec &v) { - for (unsigned i = 0; i < v.size(); ++i) { - result.push_back(v[i]); - if (i + 1 < v[i].first && i + 1 < v.size() && - v[i + 1].first != v[i].first + 1) - result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported}); - } -} - -static LegacyLegalizerInfo::SizeAndActionsVec -widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) { - assert(v.size() >= 1); - assert(v[0].first > 17); - LegacyLegalizerInfo::SizeAndActionsVec result = { - {1, LegacyLegalizeActions::Unsupported}, - {8, LegacyLegalizeActions::WidenScalar}, - {9, LegacyLegalizeActions::Unsupported}, - {16, LegacyLegalizeActions::WidenScalar}, - {17, LegacyLegalizeActions::Unsupported}}; - addAndInterleaveWithUnsupported(result, v); - auto Largest = result.back().first; - result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported}); - return result; -} - static bool AEABI(const ARMSubtarget &ST) { return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI(); } @@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .libcallFor({s32}) .clampScalar(0, s32, s32); - for (unsigned Op : {G_SREM, G_UREM}) { - LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16); - if (HasHWDivide) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower); - else if (AEABI(ST)) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom); - else - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall); - } + auto &REMBuilder = + getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32); + if (HasHWDivide) + REMBuilder.lowerFor({s32}); + else if (AEABI(ST)) + REMBuilder.customFor({s32}); + else + REMBuilder.libcallFor({s32}); getActionDefinitionsBuilder(G_INTTOPTR) .legalFor({{p0, s32}}) @@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { LoadStoreBuilder.maxScalar(0, s32); - for (auto Ty : {s32, s64}) - LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower); + getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64}); getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64}); diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 5215813..8a3454c 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">; def UnaryFloatCategory : DXILOpCategory<"Unary float">; def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; -// Following are the scalar types supported by DXIL operations and are synonymous -// to llvm_*_ty defined for readability and ease of use in the context of this file. - -def voidTy : LLVMType<isVoid>; - -// Floating point types -def f16Ty : LLVMType<f16>; -def f32Ty : LLVMType<f32>; -def f64Ty : LLVMType<f64>; - -// Integer types -def i1Ty : LLVMType<i1>; -def i8Ty : LLVMType<i8>; -def i16Ty : LLVMType<i16>; -def i32Ty : LLVMType<i32>; -def i64Ty : LLVMType<i64>; +// Represent as any pointer type with an option to change to a qualified pointer +// type with address space specified. +def dxil_handle_ty : LLVMAnyPointerType; +def dxil_cbuffer_ty : LLVMAnyPointerType; +def dxil_resource_ty : LLVMAnyPointerType; // The parameter description for a DXIL operation -class DXILOpParameter<int pos, string type, string name, string doc, +class DXILOpParameter<int pos, LLVMType type, string name, string doc, bit isConstant = 0, string enumName = "", int maxValue = 0> { int Pos = pos; // Position in parameter list - string Type = type; // LLVM type name, $o for overload, $r for resource - // type, $cb for legacy cbuffer, $u4 for u4 struct + LLVMType ParamType = type; // Parameter type string Name = name; // Short, unique parameter name string Doc = doc; // Description of this parameter bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR @@ -108,55 +96,55 @@ class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; } def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", - [f16Ty,f32Ty], ReadNone, + [llvm_half_ty, llvm_float_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value"> ], ["floats"]>, LLVMIntrinsic<int_sin>; -def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", - [i16Ty,i32Ty,i64Ty], ReadNone, +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", + [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "a", "input value">, - DXILOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, llvm_anyint_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyint_ty, "a", "input value">, + DXILOpParameter<3, llvm_anyint_ty, "b", "input value"> ], ["uints"]>, LLVMIntrinsic<int_umax>; -def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone, +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic<int_dx_thread_id>; -def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone, +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "group ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, llvm_i32_ty, "", "group ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read"> ]>, LLVMIntrinsic<int_dx_group_id>; -def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, - "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty], ReadNone, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, + "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID in group component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic<int_dx_thread_id_in_group>; -def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, - "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, + "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, llvm_i32_ty, "", "result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode"> ]>, LLVMIntrinsic<int_dx_flattened_thread_id_in_group>; diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 76f99b4..2870f0b 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen HexagonFrameLowering.cpp HexagonGenExtract.cpp HexagonGenInsert.cpp + HexagonGenMemAbsolute.cpp HexagonGenMux.cpp HexagonGenPredicate.cpp HexagonHardwareLoops.cpp @@ -50,6 +51,7 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp + HexagonPostIncOpt.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp @@ -60,6 +62,7 @@ add_llvm_target(HexagonCodeGen HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp HexagonTargetTransformInfo.cpp + HexagonTfrCleanup.cpp HexagonVectorCombine.cpp HexagonVectorLoopCarriedReuse.cpp HexagonVectorPrint.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 6024d9f..3b8234c 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { return false; const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); RegHalf H; - if (!matchHalf(0, RC, 0, H)) + unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0; + if (!matchHalf(0, RC, B, H)) return false; if (H.Low) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp new file mode 100644 index 0000000..afd4963 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -0,0 +1,274 @@ +//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This pass traverses through all the basic blocks in a function and converts +// an indexed load/store with offset "0" to a absolute-set load/store +// instruction as long as the use of the register in the new instruction +// dominates the rest of the uses and there are more than 2 uses. + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "hexagon-abs" + +using namespace llvm; + +STATISTIC(HexagonNumLoadAbsConversions, + "Number of Load instructions converted to absolute-set form"); +STATISTIC(HexagonNumStoreAbsConversions, + "Number of Store instructions converted to absolute-set form"); + +namespace llvm { +FunctionPass *createHexagonGenMemAbsolute(); +void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry); +} // namespace llvm + +namespace { + +class HexagonGenMemAbsolute : public MachineFunctionPass { + const HexagonInstrInfo *TII; + MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) { + initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon Generate Load/Store Set Absolute Address Instruction"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + static bool isValidIndexedLoad(int &Opcode, int &NewOpcode); + static bool isValidIndexedStore(int &Opcode, int &NewOpcode); +}; +} // namespace + +char HexagonGenMemAbsolute::ID = 0; + +INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute", + "Hexagon Generate Load/Store Set Absolute Address Instruction", + false, false) + +bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return false; + + TII = Fn.getSubtarget<HexagonSubtarget>().getInstrInfo(); + MRI = &Fn.getRegInfo(); + TRI = Fn.getRegInfo().getTargetRegisterInfo(); + + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + + // Loop over all of the basic blocks + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock *MBB = &*MBBb; + // Traverse the basic block + for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = &*MII; + int Opc = MI->getOpcode(); + if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi) + continue; + + const MachineOperand &MO = MI->getOperand(0); + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned DstReg = MO.getReg(); + if (MRI->use_nodbg_empty(DstReg)) + continue; + + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg); + + MachineInstr *NextMI = NextUseMI->getParent(); + int NextOpc = NextMI->getOpcode(); + int NewOpc; + bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc); + + if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc)) + continue; + + // Base and Offset positions for load and store instructions + // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm) + // Store R(base), Imm, R (src) -> mem(R(base) + Imm) = R(src) + unsigned BaseRegPos, ImmPos, RegPos; + if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos)) + continue; + RegPos = IsLoad ? 0 : 2; + + bool IsGlobal = MI->getOperand(1).isGlobal(); + if (!MI->getOperand(1).isImm() && !IsGlobal) + continue; + + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool Scalable; + TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI); + + // Ensure BaseOp is non-null and register type. + if (!BaseOp || !BaseOp->isReg()) + continue; + + if (Scalable) + continue; + + unsigned BaseReg = BaseOp->getReg(); + if ((DstReg != BaseReg) || (Offset != 0)) + continue; + + const MachineOperand &MO0 = NextMI->getOperand(RegPos); + + if (!MO0.isReg()) + continue; + + unsigned LoadStoreReg = MO0.getReg(); + + // Store: Bail out if the src and base are same (def and use on same + // register). + if (LoadStoreReg == BaseReg) + continue; + + // Insert the absolute-set instruction "I" only if the use of the + // BaseReg in "I" dominates the rest of the uses of BaseReg and if + // there are more than 2 uses of this BaseReg. + bool Dominates = true; + unsigned Counter = 0; + for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) { + Counter++; + if (!MDT.dominates(NextMI, I->getParent())) + Dominates = false; + } + + if ((!Dominates) || (Counter < 3)) + continue; + + // If we reach here, we have met all the conditions required for the + // replacement of the absolute instruction. + LLVM_DEBUG({ + dbgs() << "Found a pair of instructions for absolute-set " + << (IsLoad ? "load" : "store") << "\n"; + dbgs() << *MI; + dbgs() << *NextMI; + }); + MachineBasicBlock *ParentBlock = NextMI->getParent(); + MachineInstrBuilder MIB; + if (IsLoad) { // Insert absolute-set load instruction + ++HexagonNumLoadAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), LoadStoreReg) + .addReg(DstReg, RegState::Define); + } else { // Insert absolute-set store instruction + ++HexagonNumStoreAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), DstReg); + } + + MachineOperand ImmOperand = MI->getOperand(1); + if (IsGlobal) + MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(), + ImmOperand.getTargetFlags()); + else + MIB.addImm(ImmOperand.getImm()); + + if (IsLoad) + MIB->getOperand(0).setSubReg(MO0.getSubReg()); + else + MIB.addReg(LoadStoreReg, 0, MO0.getSubReg()); + + LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n"); + // Erase the instructions that got replaced. + MII = MBB->erase(MI); + --MII; + NextMI->getParent()->erase(NextMI); + } + } + + return true; +} + +bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::L2_loadrb_io: + NewOpc = Hexagon::L4_loadrb_ap; + break; + case Hexagon::L2_loadrh_io: + NewOpc = Hexagon::L4_loadrh_ap; + break; + case Hexagon::L2_loadri_io: + NewOpc = Hexagon::L4_loadri_ap; + break; + case Hexagon::L2_loadrd_io: + NewOpc = Hexagon::L4_loadrd_ap; + break; + case Hexagon::L2_loadruh_io: + NewOpc = Hexagon::L4_loadruh_ap; + break; + case Hexagon::L2_loadrub_io: + NewOpc = Hexagon::L4_loadrub_ap; + break; + default: + Result = false; + } + + return Result; +} + +bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::S2_storerd_io: + NewOpc = Hexagon::S4_storerd_ap; + break; + case Hexagon::S2_storeri_io: + NewOpc = Hexagon::S4_storeri_ap; + break; + case Hexagon::S2_storerh_io: + NewOpc = Hexagon::S4_storerh_ap; + break; + case Hexagon::S2_storerb_io: + NewOpc = Hexagon::S4_storerb_ap; + break; + default: + Result = false; + } + + return Result; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonGenMemAbsolute() { + return new HexagonGenMemAbsolute(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 619c7dc..91cc930 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const { return getAddrMode(MI) == HexagonII::PostInc; } +bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const { + unsigned BasePos, OffsetPos; + if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return false; + return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm(); +} + // Returns true if an instruction is predicated irrespective of the predicate // sense. For example, all of the following will return true. // if (p0) R1 = add(R2, R3) @@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const { Opcode == Hexagon::J2_loop1rext; } +bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + case Hexagon::L2_loadalignb_pci: + case Hexagon::L2_loadalignb_pcr: + case Hexagon::L2_loadalignh_pci: + case Hexagon::L2_loadalignh_pcr: + case Hexagon::L2_loadbsw2_pci: + case Hexagon::L2_loadbsw2_pcr: + case Hexagon::L2_loadbsw4_pci: + case Hexagon::L2_loadbsw4_pcr: + case Hexagon::L2_loadbzw2_pci: + case Hexagon::L2_loadbzw2_pcr: + case Hexagon::L2_loadbzw4_pci: + case Hexagon::L2_loadbzw4_pcr: + case Hexagon::L2_loadrb_pci: + case Hexagon::L2_loadrb_pcr: + case Hexagon::L2_loadrd_pci: + case Hexagon::L2_loadrd_pcr: + case Hexagon::L2_loadrh_pci: + case Hexagon::L2_loadrh_pcr: + case Hexagon::L2_loadri_pci: + case Hexagon::L2_loadri_pcr: + case Hexagon::L2_loadrub_pci: + case Hexagon::L2_loadrub_pcr: + case Hexagon::L2_loadruh_pci: + case Hexagon::L2_loadruh_pcr: + case Hexagon::S2_storerbnew_pci: + case Hexagon::S2_storerbnew_pcr: + case Hexagon::S2_storerb_pci: + case Hexagon::S2_storerb_pcr: + case Hexagon::S2_storerd_pci: + case Hexagon::S2_storerd_pcr: + case Hexagon::S2_storerf_pci: + case Hexagon::S2_storerf_pcr: + case Hexagon::S2_storerhnew_pci: + case Hexagon::S2_storerhnew_pcr: + case Hexagon::S2_storerh_pci: + case Hexagon::S2_storerh_pcr: + case Hexagon::S2_storerinew_pci: + case Hexagon::S2_storerinew_pcr: + case Hexagon::S2_storeri_pci: + case Hexagon::S2_storeri_pcr: + return true; + } + return false; +} + bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return false; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index e496995..65783c5 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -434,6 +434,8 @@ public: bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const; bool PredOpcodeHasJMP_c(unsigned Opcode) const; bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; + bool isPostIncWithImmOffset(const MachineInstr &MI) const; + bool isCircBufferInstr(const MachineInstr &MI) const; unsigned getAddrMode(const MachineInstr &MI) const; MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset, diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp new file mode 100644 index 0000000..4c845f2 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp @@ -0,0 +1,689 @@ +//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Convert post-inc addressing mode into base-offset addressing mode. +// Ex: +// original loop: +// v1 = phi(v0, v3) +// v2,v3 = post_load v1, 4 + +// Often, unroller creates below form of post-increments: +// v1 = phi(v0, v3') +// v2,v3 = post_load v1, 4 +// v2',v3'= post_load v3, 4 + +// This can be optimized in two ways + +// 1. +// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v3', -4 +// +// 2. +// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v1, 4 +// +// Option 2 is favored as we can packetize two memory operations in a single +// packet. However, this is not always favorable due to memory dependences +// and in cases where we form a bigger chain of post-increment ops that will +// create more spills as we can not execute post-increment ops with out +// executing base-offset instructions. +//===----------------------------------------------------------------------===// +#include "HexagonInstrInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagon-postincopt" + +static cl::opt<unsigned> PostIncChainThreshold( + "post-inc-chain-threshold", cl::Hidden, cl::init(4), + cl::desc("Limit the number of post-inc instructions in a chain.")); + +static cl::opt<bool> PreferPostIncStore( + "prefer-post-inc-store", cl::Hidden, cl::init(true), + cl::desc("Prefer post-inc store in a list of loads and stores.")); + +namespace llvm { +void initializeHexagonPostIncOptPass(PassRegistry &); +FunctionPass *createHexagonPostIncOpt(); +} // namespace llvm + +namespace { + +class HexagonPostIncOpt : public MachineFunctionPass { + MachineLoopInfo *MLI = nullptr; + const HexagonInstrInfo *HII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const HexagonSubtarget *HST = nullptr; + +public: + static char ID; + + HexagonPostIncOpt() : MachineFunctionPass(ID) { + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + bool translatePostIncsInLoop(MachineBasicBlock &MBB); + void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const; + void replacePostIncWithBaseOffset(MachineInstr &MI) const; + bool isPostIncInsn(MachineInstr &MI) const; + void foldAdds(MachineBasicBlock &MBB) const; + void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const; + void removeDeadInstructions(MachineBasicBlock &MBB) const; + + void generatePostInc(MachineBasicBlock &MBB); + bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + + bool isValidOffset(const MachineInstr &MI, int64_t Offset) const; + bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const; +}; + +class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs { + HexagonPostIncOpt &Pass; + +public: + HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF, + MachineLoopInfo *MLI) + : ScheduleDAGInstrs(MF, MLI, false), Pass(P){}; + void schedule() override; + ScheduleDAGTopologicalSort &getTopo() { return Topo; }; +}; + +} // End anonymous namespace. + +char HexagonPostIncOpt::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE, + "Hexagon Post-Inc-Opt Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass", + false, false) + +/// Return true if MIA dominates MIB. +static bool dominates(MachineInstr *MIA, MachineInstr *MIB) { + if (MIA->getParent() != MIB->getParent()) + return false; // Don't know since machine dominator tree is out of date. + + MachineBasicBlock *MBB = MIA->getParent(); + MachineBasicBlock::iterator I = MBB->instr_begin(); + // Iterate over the basic block until MIA or MIB is found. + for (; &*I != MIA && &*I != MIB; ++I) + ; + + // MIA dominates MIB if MIA is found first. + return &*I == MIA; +} + +// Return the Phi register value that comes from the loop block. +static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i).getReg(); + return UINT_MAX; +} + +static bool isAddWithImmValue(const MachineInstr &MI) { + // FIXME: For now, only deal with adds that have strict immediate values. + // Some A2_addi instructions can be of the form. + // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16 + return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm(); +} + +// Compute the number of 'real' instructions in the basic block by +// ignoring terminators. +static unsigned getBasicBlockSize(MachineBasicBlock &MBB) { + unsigned size = 0; + for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator())) + if (!I.isDebugInstr()) + size++; + return size; +} + +// Setup Post increment Schedule DAG. +static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG, + MachineBasicBlock &MBB) { + PIDAG.startBlock(&MBB); + PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(), + getBasicBlockSize(MBB)); + // Build the graph. + PIDAG.schedule(); + // exitRegion() is an empty function in base class. So, safe to call it here. + PIDAG.exitRegion(); +} + +// Check if post-increment candidate has any memory dependence on any +// instruction in the chain. +static bool hasMemoryDependency(SUnit *PostIncSU, + SmallVector<MachineInstr *, 4> &UseList) { + + // FIXME: Fine tune the order dependence. Probably can only consider memory + // related OrderKind. + for (auto &Dep : PostIncSU->Succs) + if (Dep.getKind() == SDep::Order) + if (std::find(UseList.begin(), UseList.end(), + Dep.getSUnit()->getInstr()) != UseList.end()) + return true; + + return false; +} + +// Fold an add with immediate into either an add or a load or a store. +void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n"); + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + if (!isAddWithImmValue(MI)) + continue; + unsigned DefReg = MI.getOperand(0).getReg(); + unsigned AddReg = MI.getOperand(1).getReg(); + int64_t AddImm = MI.getOperand(2).getImm(); + + SmallVector<MachineInstr *, 4> UseList; + // Gather the uses of add instruction's def reg. + for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + // Deal with only the instuctions that belong to this block. + // If we cross this block, the generation of post-increment logic + // will not be able to transform to post-inc due to dominance. + if (UseMI->getParent() == &MBB) + UseList.push_back(UseMI); + } + + if (UseList.empty()) + continue; + + LLVM_DEBUG({ + dbgs() << "Current instruction considered for folding \n"; + MI.dump(); + }); + + for (auto UseMI : UseList) { + if (isAddWithImmValue(*UseMI)) { + int64_t NewImm = AddImm + UseMI->getOperand(2).getImm(); + // Fold if the new immediate is with in the range. + if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded in to \n"; + }); + UseMI->getOperand(1).setReg(AddReg); + UseMI->getOperand(2).setImm(NewImm); + LLVM_DEBUG(UseMI->dump()); + } + } else if (HII->isBaseImmOffset(*UseMI)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded in to \n"; + }); + updateBaseAndOffset(*UseMI, MI); + LLVM_DEBUG(UseMI->dump()); + } + LLVM_DEBUG(dbgs() << "\n"); + } + } + removeDeadInstructions(MBB); + LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n"); +} + +void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI, + MachineInstr &AddMI) const { + assert(HII->isBaseImmOffset(MI)); + unsigned BasePos, OffsetPos; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + + MachineOperand &OffsetOp = MI.getOperand(OffsetPos); + MachineOperand &BaseOp = MI.getOperand(BasePos); + + if (BaseOp.getReg() != AddMI.getOperand(0).getReg()) + return; + + unsigned IncBase = AddMI.getOperand(1).getReg(); + int64_t IncValue = AddMI.getOperand(2).getImm(); + + int64_t NewOffset = OffsetOp.getImm() + IncValue; + if (!isValidOffset(MI, NewOffset)) + return; + + OffsetOp.setImm(NewOffset); + BaseOp.setReg(IncBase); +} + +void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const { + // For MBB, check that the value defined by each instruction is used. + // If not, delete it. + for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(), + ME = MBB.instr_rend(); + MI != ME;) { + // From DeadMachineInstructionElem. Don't delete inline assembly. + if (MI->isInlineAsm()) { + ++MI; + continue; + } + bool SawStore = false; + // Check if it's safe to remove the instruction due to side effects. + if (!MI->isSafeToMove(nullptr, SawStore)) { + ++MI; + continue; + } + unsigned Uses = 0; + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + unsigned reg = MOI->getReg(); + // Assume physical registers are used. + if (Register::isPhysicalRegister(reg)) { + Uses++; + continue; + } + if (MRI->use_begin(reg) != MRI->use_end()) + Uses++; + } + if (!Uses) { + MI++->eraseFromParent(); + continue; + } + ++MI; + } +} + +bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const { + // Predicated post-increments are not yet handled. (ISel is not generating + // them yet). Circular buffer instructions should not be handled. + return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) && + !HII->isCircBufferInstr(MI)); +} + +/// For instructions with a base and offset, return true if the new Offset +/// is a valid value with the correct alignment. +bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI, + int64_t Offset) const { + if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false)) + return false; + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + return (Offset & AlignMask) == 0; +} + +bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI, + int IncVal) const { + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + if ((IncVal & AlignMask) != 0) + return false; + + // Number of total bits in the instruction used to encode Inc value. + unsigned IncBits = 4; + // For HVX instructions, the offset is 3. + if (HexagonII::isCVI(MI.getDesc())) + IncBits = 3; + + IncBits += Log2_32(HII->getMemAccessSize(MI)); + if (HII->getMemAccessSize(MI) > 8) + IncBits = 16; + + int MinValidVal = -1U << (IncBits - 1); + int MaxValidVal = ~(-1U << (IncBits - 1)); + return (IncVal >= MinValidVal && IncVal <= MaxValidVal); +} + +void HexagonPostIncOptSchedDAG::schedule() { + AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults(); + buildSchedGraph(AA); +} + +// Replace post-increment operations with base+offset counterpart. +void HexagonPostIncOpt::replacePostIncWithBaseOffset( + MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with " + "base+offset counterparts.\n"); + + SmallVector<MachineInstr *, 4> MIList; + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + // Check for eligible post-inc candidates. + if (!isPostIncInsn(MI)) + continue; + MIList.push_back(&MI); + } + + for (auto MI : MIList) + replacePostIncWithBaseOffset(*MI); + + LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n"); +} + +void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const { + short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode()); + if (NewOpcode < 0) + return; + + unsigned BasePos = 0, OffsetPos = 0; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos); + const MachineOperand &PostIncBase = MI.getOperand(BasePos); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineOperand *PostIncDest; + MachineInstrBuilder MIB; + if (MI.mayLoad()) { + PostIncDest = &MI.getOperand(1); + const MachineOperand &LDValue = MI.getOperand(0); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(PostIncBase).addImm(0); + } else { + PostIncDest = &MI.getOperand(0); + const MachineOperand &STValue = MI.getOperand(3); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + MIB.add(PostIncBase).addImm(0).add(STValue); + } + + // Transfer memoperands. + MIB->cloneMemRefs(*MBB.getParent(), MI); + + // Create an add instruction for the post-inc addition of offset. + MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi)); + MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset); + + LLVM_DEBUG({ + dbgs() << "\n"; + MI.dump(); + dbgs() << "\tis tranformed to \n"; + MIB->dump(); + MIBA->dump(); + dbgs() << "\n\n"; + }); + + MI.eraseFromParent(); +} + +void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n"); + MachineBasicBlock::iterator MII = MBB.getFirstNonPHI(); + MachineBasicBlock::iterator MIE = MBB.instr_begin(); + bool isOK = true; + while (MII != MIE) { + MachineInstr *Phi = &*std::prev(MII); + MII = std::prev(MII); + unsigned LoopVal = getLoopPhiReg(Phi, &MBB); + if (LoopVal == UINT_MAX) + continue; + MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); + if (!isAddWithImmValue(*LoopInst)) + continue; + + if (LoopInst->getOpcode() != Hexagon::A2_addi) + continue; + + unsigned AddReg = LoopInst->getOperand(1).getReg(); + int64_t AddImm = LoopInst->getOperand(2).getImm(); + SmallVector<MachineInstr *, 4> UseList; + MachineInstr *PostIncCandidate = nullptr; + + // Find the probable candidates for Post-increment instruction. + SmallVector<MachineInstr *, 4> CandList; + for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + + if (UseMI == LoopInst) + continue; + + if (!dominates(UseMI, LoopInst)) { + isOK = false; + break; + } + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool OffsetIsScalable; + if (!HII->isBaseImmOffset(*UseMI) || + !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset, + OffsetIsScalable, TRI)) { + isOK = false; + break; + } + int64_t NewOffset = Offset - AddImm; + if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() || + BaseOp->getReg() != AddReg) { + isOK = false; + break; + } + if (OffsetIsScalable) { + isOK = false; + break; + } + if (Offset == 0) { + // If you have stores in the chain, make sure they are in the beginning + // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST, + // ST. + if (UseMI->mayStore() && PreferPostIncStore) + CandList.insert(CandList.begin(), UseMI); + else + CandList.push_back(UseMI); + continue; + } + UseList.push_back(UseMI); + } + + if (!isOK) + continue; + + for (auto MI : CandList) { + if (!PostIncCandidate) + PostIncCandidate = MI; + // Push the rest of the list for updation. + else + UseList.push_back(MI); + } + + // If a candidate is found, replace it with the post-inc instruction. + // Also, adjust offset for other uses as needed. + if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst)) + continue; + + // Logic to determine what the base register to be. + // There are two choices: + // 1. New address register after we updated the post-increment candidate. + // v2,v3 = post_load v1, 4 + // v3 is the choice here. + // 2. The base register we used in post-increment candidate. + // v2,v3 = post_load v1, 4 + // v1 is the choice here. + // Use v3 if there is a memory dependence between post-inc instruction and + // any other instruction in the chain. + // FIXME: We can do some complex DAG analysis based off height and depth and + // selectively update other instructions in the chain. Use v3 if there are + // more instructions in the chain, otherwise we will end up increasing the + // height of the DAG resulting in more spills. By default we have a + // threshold controlled by the option "post-inc-chain-threshold" which is + // set to 4. v1 is preferred as we can packetize two memory operations in a + // single packet in scalar core. But it heavily depends on the structure of + // DAG. + bool UpdateBaseToNew = false; + + // Do not bother to build a DAG and analyze if the Use list is empty. + if (!UseList.empty()) { + MachineFunction *MF = MBB.getParent(); + // Setup the Post-inc schedule DAG. + HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI); + initPISchedDAG(PIDAG, MBB); + SUnit *SU = PIDAG.getSUnit(PostIncCandidate); + if (hasMemoryDependency(SU, UseList) || + UseList.size() >= PostIncChainThreshold) + UpdateBaseToNew = true; + } + + if (UpdateBaseToNew) { + LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the " + "base register of post-increment\n"); + for (auto UseMI : UseList) { + if (!dominates(PostIncCandidate, UseMI)) + continue; + unsigned BasePos, OffsetPos; + if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) { + // New offset has already been validated; no need to do it again. + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is transformed to \n"; + }); + int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm; + UseMI->getOperand(OffsetPos).setImm(NewOffset); + UseMI->getOperand(BasePos).setReg(LoopVal); + LLVM_DEBUG(UseMI->dump()); + } + } + } + replaceWithPostInc(PostIncCandidate, LoopInst); + } + LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n"); +} + +bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0) + return false; + assert(AddMI->getOpcode() == Hexagon::A2_addi); + return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm()); +} + +void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode()); + assert(NewOpcode >= 0 && + "Couldn't change base offset to post-increment form"); + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + const MachineOperand &IncDest = AddMI->getOperand(0); + const MachineOperand &IncBase = AddMI->getOperand(1); + const MachineOperand &IncValue = AddMI->getOperand(2); + MachineInstrBuilder MIB; + LLVM_DEBUG({ + dbgs() << "\n\n"; + MI->dump(); + dbgs() << "\t is tranformed to post-inc form of \n"; + }); + + if (MI->mayLoad()) { + const MachineOperand &LDValue = MI->getOperand(0); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue); + } else { + const MachineOperand &STValue = MI->getOperand(2); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue); + } + + // Transfer memoperands. + MIB->cloneMemRefs(*MBB.getParent(), *MI); + + LLVM_DEBUG({ + MIB->dump(); + dbgs() << "As a result this add instruction is erased.\n"; + AddMI->dump(); + }); + + MI->eraseFromParent(); + AddMI->eraseFromParent(); +} + +bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) { + // Algorithm: + // 1. Replace all the post-inc instructions with Base+Offset instruction and + // an add instruction in this block. + // 2. Fold all the adds in to respective uses. + // 3. Generate post-increment instructions and update the uses of the base + // register if needed based on constraints. + + replacePostIncWithBaseOffset(MBB); + foldAdds(MBB); + generatePostInc(MBB); + return true; +} + +bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) { + + // Skip pass if requested. + if (skipFunction(MF.getFunction())) + return false; + + // Get Target Information. + MLI = &getAnalysis<MachineLoopInfo>(); + HST = &MF.getSubtarget<HexagonSubtarget>(); + TRI = HST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + HII = HST->getInstrInfo(); + + // Skip this pass for TinyCore. + // Tiny core allwos partial post increment operations - This constraint can + // be imposed inside the pass. In a chain of post-increments, the first can + // be post-increment, rest can be adjusted to base+offset (these are + // inexpensive in most of the cases); + if (HST->isTinyCore()) + return false; + + LLVM_DEBUG({ + dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n"; + dbgs() << "Function: " << MF.getName() << "\n"; + }); + bool Change = false; + std::vector<MachineBasicBlock *> MLBB; + for (auto &BB : MF) { + // Check if this Basic Block belongs to any loop. + auto *LI = MLI->getLoopFor(&BB); + // We only deal with inner-most loops that has one block. + if (LI && LI->getBlocks().size() == 1) { + MachineBasicBlock *MBB = LI->getHeader(); + // Do not traverse blocks that are already visited. + if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end()) + continue; + + MLBB.push_back(MBB); + + LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n"); + Change |= translatePostIncsInLoop(*MBB); + } + } + LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n"); + return Change; +} + +FunctionPass *llvm::createHexagonPostIncOpt() { + return new HexagonPostIncOpt(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 7d4b420..a5ebd64 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -65,6 +65,10 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true), + cl::Hidden, + cl::desc("Cleanup of TFRs/COPYs")); + static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, cl::desc("Enable early if-conversion")); @@ -92,6 +96,10 @@ static cl::opt<bool> static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, cl::desc("Disable splitting double registers")); +static cl::opt<bool> + EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden, + cl::desc("Generate absolute set instructions")); + static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true), cl::Hidden, cl::desc("Bit simplification")); @@ -121,6 +129,10 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden, cl::init(true), cl::desc("Enable instsimplify")); +static cl::opt<bool> DisableHexagonPostIncOpt( + "hexagon-postinc-opt", cl::Hidden, + cl::desc("Disable Hexagon post-increment optimization")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. In particular, it seems that it is not possible to get @@ -145,20 +157,24 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { extern char &HexagonExpandCondsetsID; + extern char &HexagonTfrCleanupID; void initializeHexagonBitSimplifyPass(PassRegistry&); void initializeHexagonConstExtendersPass(PassRegistry&); void initializeHexagonConstPropagationPass(PassRegistry&); void initializeHexagonCopyToCombinePass(PassRegistry&); void initializeHexagonEarlyIfConversionPass(PassRegistry&); void initializeHexagonExpandCondsetsPass(PassRegistry&); + void initializeHexagonGenMemAbsolutePass(PassRegistry &); void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonHardwareLoopsPass(PassRegistry&); void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &); void initializeHexagonNewValueJumpPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); void initializeHexagonPacketizerPass(PassRegistry&); + void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); + void initializeHexagonTfrCleanupPass(PassRegistry &); void initializeHexagonVExtractPass(PassRegistry &); void initializeHexagonVectorCombineLegacyPass(PassRegistry&); void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &); @@ -177,6 +193,7 @@ namespace llvm { FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMemAbsolute(); FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); @@ -188,10 +205,12 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonPostIncOpt(); FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); + FunctionPass *createHexagonTfrCleanup(); FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); @@ -211,12 +230,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { initializeHexagonConstPropagationPass(PR); initializeHexagonCopyToCombinePass(PR); initializeHexagonEarlyIfConversionPass(PR); + initializeHexagonGenMemAbsolutePass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonHardwareLoopsPass(PR); initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR); initializeHexagonNewValueJumpPass(PR); initializeHexagonOptAddrModePass(PR); initializeHexagonPacketizerPass(PR); + initializeHexagonPostIncOptPass(PR); initializeHexagonRDFOptPass(PR); initializeHexagonSplitDoubleRegsPass(PR); initializeHexagonVectorCombineLegacyPass(PR); @@ -244,6 +265,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, (HexagonNoOpt ? CodeGenOptLevel::None : OL)), TLOF(std::make_unique<HexagonTargetObjectFile>()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -411,11 +434,20 @@ void HexagonPassConfig::addPreRegAlloc() { addPass(createHexagonConstExtenders()); if (EnableExpandCondsets) insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID); + if (EnableTfrCleanup) + insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID); if (!DisableStoreWidening) addPass(createHexagonStoreWidening()); + if (EnableGenMemAbs) + addPass(createHexagonGenMemAbsolute()); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops()); } + + if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive) + if (!DisableHexagonPostIncOpt) + addPass(createHexagonPostIncOpt()); + if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(&MachinePipelinerID); } diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp new file mode 100644 index 0000000..a4b359a --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -0,0 +1,324 @@ +//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass is to address a situation that appears after register allocaion +// evey now and then, namely a register copy from a source that was defined +// as an immediate value in the same block (usually just before the copy). +// +// Here is an example of actual code emitted that shows this problem: +// +// .LBB0_5: +// { +// r5 = zxtb(r8) +// r6 = or(r6, ##12345) +// } +// { +// r3 = xor(r1, r2) +// r1 = #0 <-- r1 set to #0 +// } +// { +// r7 = r1 <-- r7 set to r1 +// r0 = zxtb(r3) +// } + +#define DEBUG_TYPE "tfr-cleanup" +#include "HexagonTargetMachine.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createHexagonTfrCleanup(); +void initializeHexagonTfrCleanupPass(PassRegistry &); +} // namespace llvm + +namespace { +class HexagonTfrCleanup : public MachineFunctionPass { +public: + static char ID; + HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) { + PassRegistry &R = *PassRegistry::getPassRegistry(); + initializeHexagonTfrCleanupPass(R); + } + StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + const HexagonInstrInfo *HII; + const TargetRegisterInfo *TRI; + + typedef DenseMap<unsigned, uint64_t> ImmediateMap; + + bool isIntReg(unsigned Reg, bool &Is32); + void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap); + bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap); + bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap); + bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes); + bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes); +}; +} // namespace + +char HexagonTfrCleanup::ID = 0; + +namespace llvm { +char &HexagonTfrCleanupID = HexagonTfrCleanup::ID; +} + +bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) { + Is32 = Hexagon::IntRegsRegClass.contains(Reg); + return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg); +} + +// Assign given value V32 to the specified the register R32 in the map. Only +// 32-bit registers are valid arguments. +void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) { + ImmediateMap::iterator F = IMap.find(R32); + if (F == IMap.end()) + IMap.insert(std::make_pair(R32, V32)); + else + F->second = V32; +} + +// Retrieve a value of the provided register Reg and store it into Val. +// Return "true" if a value was found, "false" otherwise. +bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val, + ImmediateMap &IMap) { + bool Is32; + if (!isIntReg(Reg, Is32)) + return false; + + if (Is32) { + ImmediateMap::iterator F = IMap.find(Reg); + if (F == IMap.end()) + return false; + Val = F->second; + return true; + } + + // For 64-bit registers, compose the value from the values of its + // subregisters. + unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo); + unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi); + ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH); + if (FL == IMap.end() || FH == IMap.end()) + return false; + Val = (FH->second << 32) | FL->second; + return true; +} + +// Process an instruction and record the relevant information in the imme- +// diate map. +bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) { + using namespace Hexagon; + + if (MI->isCall()) { + IMap.clear(); + return true; + } + + // If this is an instruction that loads a constant into a register, + // record this information in IMap. + unsigned Opc = MI->getOpcode(); + if (Opc == A2_tfrsi || Opc == A2_tfrpi) { + unsigned DefR = MI->getOperand(0).getReg(); + bool Is32; + if (!isIntReg(DefR, Is32)) + return false; + if (!MI->getOperand(1).isImm()) { + if (!Is32) { + IMap.erase(TRI->getSubReg(DefR, isub_lo)); + IMap.erase(TRI->getSubReg(DefR, isub_hi)); + } else { + IMap.erase(DefR); + } + return false; + } + uint64_t Val = MI->getOperand(1).getImm(); + // If it's a 64-bit register, break it up into subregisters. + if (!Is32) { + uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU); + setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap); + setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap); + } else { + setReg(DefR, Val, IMap); + } + return true; + } + + // Not a A2_tfr[sp]i. Invalidate all modified registers in IMap. + for (MachineInstr::mop_iterator Mo = MI->operands_begin(), + E = MI->operands_end(); + Mo != E; ++Mo) { + if (Mo->isRegMask()) { + IMap.clear(); + return true; + } + if (!Mo->isReg() || !Mo->isDef()) + continue; + unsigned R = Mo->getReg(); + for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) { + ImmediateMap::iterator F = IMap.find(*AR); + if (F != IMap.end()) + IMap.erase(F); + } + } + return true; +} + +// Rewrite the instruction as A2_tfrsi/A2_tfrpi, it is a copy of a source that +// has a known constant value. +bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, + SlotIndexes *Indexes) { + using namespace Hexagon; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case A2_tfr: + case A2_tfrp: + case COPY: + break; + default: + return false; + } + + unsigned DstR = MI->getOperand(0).getReg(); + unsigned SrcR = MI->getOperand(1).getReg(); + bool Tmp, Is32; + if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp)) + return false; + assert(Tmp == Is32 && "Register size mismatch"); + uint64_t Val; + bool Found = getReg(SrcR, Val, IMap); + if (!Found) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + int64_t SVal = Is32 ? int32_t(Val) : Val; + auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>(); + MachineInstr *NewMI; + if (Is32) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal); + else if (isInt<8>(SVal)) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal); + else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL))) + NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR) + .addImm(int32_t(SVal >> 32)) + .addImm(int32_t(Val & 0xFFFFFFFFLL)); + else if (HST.isTinyCore()) + // Disable generating CONST64 since it requires load resource. + return false; + else + NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val); + + // Replace the MI to reuse the same slot index + if (Indexes) + Indexes->replaceMachineInstrInMaps(*MI, *NewMI); + MI->eraseFromParent(); + return true; +} + +// Remove the instruction if it is a self-assignment. +bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI, + SlotIndexes *Indexes) { + unsigned Opc = MI->getOpcode(); + unsigned DefR, SrcR; + bool IsUndef = false; + switch (Opc) { + case Hexagon::A2_tfr: + // Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(1).getReg(); + IsUndef = MI->getOperand(1).isUndef(); + break; + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + // if ([!]Pu) Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(2).getReg(); + IsUndef = MI->getOperand(2).isUndef(); + break; + default: + return false; + } + if (DefR != SrcR) + return false; + if (IsUndef) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR); + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef() && Op.isImplicit()) + DefI->addOperand(Op); + } + + if (Indexes) + Indexes->removeMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return true; +} + +bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + // Map: 32-bit register -> immediate value. + // 64-bit registers are stored through their subregisters. + ImmediateMap IMap; + SlotIndexes *Indexes = this->getAnalysisIfAvailable<SlotIndexes>(); + + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + HII = HST.getInstrInfo(); + TRI = HST.getRegisterInfo(); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock &B = *I; + MachineBasicBlock::iterator J, F, NextJ; + IMap.clear(); + bool Inserted = false, Erased = false; + for (J = B.begin(), F = B.end(); J != F; J = NextJ) { + NextJ = std::next(J); + MachineInstr *MI = &*J; + bool E = eraseIfRedundant(MI, Indexes); + Erased |= E; + if (E) + continue; + Inserted |= rewriteIfImm(MI, IMap, Indexes); + MachineBasicBlock::iterator NewJ = std::prev(NextJ); + updateImmMap(&*NewJ, IMap); + } + bool BlockC = Inserted | Erased; + Changed |= BlockC; + if (BlockC && Indexes) + Indexes->repairIndexesInRange(&B, B.begin(), B.end()); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false, + false) + +FunctionPass *llvm::createHexagonTfrCleanup() { + return new HexagonTfrCleanup(); +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ca98269..9840412 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -18,6 +18,7 @@ #include "HexagonDepITypes.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/MC/MCInstrDesc.h" namespace llvm { @@ -48,7 +49,7 @@ namespace HexagonII { // MCInstrDesc TSFlags // *** Must match HexagonInstrFormat*.td *** - enum { + enum HexagonTSFlagsVal { // This 7-bit field describes the insn type. TypePos = 0, TypeMask = 0x7f, @@ -173,6 +174,11 @@ namespace HexagonII { hasUnaryRestrictionMask = 0x1, }; + inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos, + unsigned Mask) { + return (MID.TSFlags >> Pos) & Mask; + } + // *** The code above must match HexagonInstrFormat*.td *** // // Hexagon specific MO operand flag mask. @@ -275,6 +281,10 @@ namespace HexagonII { INST_ICLASS_ALU32_3 = 0xf0000000 }; + inline bool isCVI(const MCInstrDesc &MID) { + return getTSFlags(MID, isCVIPos, isCVIMask) != 0; + } + LLVM_ATTRIBUTE_UNUSED static unsigned getMemAccessSizeInBytes(MemAccessSize S) { switch (S) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ded2f25..3ff8994 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); + if (Opcode == NVPTX::StoreRetvalI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, @@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF32, NVPTX::StoreParamF64); + if (Opcode == NVPTX::StoreParamI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d2fe78..66a1010 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -59,6 +60,7 @@ #include <cmath> #include <cstdint> #include <iterator> +#include <optional> #include <sstream> #include <string> #include <utility> @@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } +static bool adjustElementType(EVT &ElementType) { + switch (ElementType.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::f16: + case MVT::bf16: + ElementType = MVT::i16; + return true; + case MVT::f32: + case MVT::v2f16: + case MVT::v2bf16: + ElementType = MVT::i32; + return true; + case MVT::f64: + ElementType = MVT::i64; + return true; + } +} + +// Use byte-store when the param address of the argument value is unaligned. +// This may happen when the return value is a field of a packed structure. +// +// This is called in LowerCall() when passing the param values. +static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue StVal, SDValue &InGlue, + unsigned ArgID, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); + + // Store each byte + SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal, InGlue}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); + InGlue = Chain.getValue(1); + } + return Chain; +} + +// Use byte-load when the param adress of the returned value is unaligned. +// This may happen when the returned value is a field of a packed structure. +static SDValue +LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, + EVT ElementType, SDValue &InGlue, + SmallVectorImpl<SDValue> &TempProxyRegOps, + const SDLoc &dl) { + // Bit logic only works on integer types + EVT MergedType = ElementType; + adjustElementType(MergedType); + + // Load each byte and construct the whole value. Initial value to 0 + SDValue RetVal = DAG.getConstant(0, dl, MergedType); + // LoadParamMemI8 loads into i16 register only + SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + InGlue}; + // This will be selected to LoadParamMemI8 + SDValue LdVal = + DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, + MVT::i8, MachinePointerInfo(), Align(1)); + SDValue TmpLdVal = LdVal.getValue(0); + Chain = LdVal.getValue(1); + InGlue = LdVal.getValue(2); + + TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, + TmpLdVal.getSimpleValueType(), TmpLdVal); + TempProxyRegOps.push_back(TmpLdVal); + + SDValue CMask = DAG.getConstant(255, dl, MergedType); + SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); + // Need to extend the i16 register to the whole width. + TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); + // Mask off the high bits. Leave only the lower 8bits. + // Do this because we are using loadparam.b8. + TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); + // Shift and merge + TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); + RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); + } + if (ElementType != MergedType) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + return RetVal; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedAlign) PartAlign = commonAlignment(ArgAlign, CurOffset); - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back( - DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant( - IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), - dl, MVT::i32)); - } - SDValue StVal = OutVals[OIdx]; MVT PromotedVT; @@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar store. In such cases, fall back to byte stores. + if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && + PartAlign.value() < + DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + Chain = LowerUnalignedStoreParam( + DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, + StVal, InGlue, ParamCount, dl); + + // LowerUnalignedStoreParam took care of inserting the necessary nodes + // into the SDAG, so just move on to the next element. + if (!IsByVal) + ++OIdx; + continue; + } + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back( + DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); + + StoreOperands.push_back(DAG.getConstant( + IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), + dl, MVT::i32)); + } + // Record the value to store. StoreOperands.push_back(StVal); @@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 16> ProxyRegOps; SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; + // An item of the vector is filled if the element does not need a ProxyReg + // operation on it and should be added to InVals as is. ProxyRegOps and + // ProxyRegTruncates contain empty/none items at the same index. + SmallVector<SDValue, 16> RetElts; + // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` + // to use the values of `LoadParam`s and to be replaced later then + // `CALLSEQ_END` is added. + SmallVector<SDValue, 16> TempProxyRegOps; // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { @@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EltType = MVT::i16; } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar load. In such cases, fall back to byte loads. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && + EltAlign < DL.getABITypeAlign( + TheLoadType.getTypeForEVT(*DAG.getContext()))) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + SDValue Ret = LowerUnalignedLoadRetParam( + DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); + ProxyRegOps.push_back(SDValue()); + ProxyRegTruncates.push_back(std::optional<MVT>()); + RetElts.resize(i); + RetElts.push_back(Ret); + + continue; + } + // Record index of the very first element of the vector. if (VectorInfo[i] & PVF_FIRST) { assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); @@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { + if (i < RetElts.size() && RetElts[i]) { + InVals.push_back(RetElts[i]); + continue; + } + SDValue Ret = DAG.getNode( NVPTXISD::ProxyReg, dl, DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), @@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret); } + for (SDValue &T : TempProxyRegOps) { + SDValue Repl = DAG.getNode( + NVPTXISD::ProxyReg, dl, + DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), + {Chain, T.getOperand(0), InGlue}); + DAG.ReplaceAllUsesWith(T, Repl); + DAG.RemoveDeadNode(T.getNode()); + + Chain = Repl.getValue(1); + InGlue = Repl.getValue(2); + } + // set isTailCall to false for now, until we figure out how to express // tail call optimization in PTX isTailCall = false; @@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + + const MaybeAlign PartAlign = [&]() -> MaybeAlign { + if (aggregateIsPacked) + return Align(1); + if (NumElts != 1) + return std::nullopt; + Align PartAlign = + (Offsets[parti] == 0 && PAL.getParamAlignment(i)) + ? PAL.getParamAlignment(i).value() + : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + return commonAlignment(PartAlign, Offsets[parti]); + }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), - MaybeAlign(aggregateIsPacked ? 1 : 0), + MachinePointerInfo(srcValue), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) @@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } +// Use byte-store when the param adress of the return value is unaligned. +// This may happen when the return value is a field of a packed structure. +static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue RetVal, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + // Store each byte + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), StoreOperands, + MVT::i8, MachinePointerInfo(), std::nullopt, + MachineMemOperand::MOStore); + } + return Chain; +} + SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 6> StoreOperands; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - SDValue OutVal = OutVals[i]; SDValue RetVal = PromotedOutVals[i]; @@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } + // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned + // for a scalar store. In such cases, fall back to byte stores. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { + EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Align ElementTypeAlign = + DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); + Align ElementAlign = + commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); + if (ElementAlign < ElementTypeAlign) { + assert(StoreOperands.empty() && "Orphaned operand list."); + Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, + RetVal, dl); + + // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes + // into the graph, so just move on to the next element. + continue; + } + } + + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + // Record the value to return. StoreOperands.push_back(RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 55a1955..b3517ce 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; +def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">; +def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">; def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; @@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; +def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">; +def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">; def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 904f1d7..c922098 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = SubVecVT; // Establish the correct scalable-vector types for any fixed-length type. - if (SubVecVT.isFixedLengthVector()) + if (SubVecVT.isFixedLengthVector()) { + assert(Idx == 0 && V.isUndef()); SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); + } if (VT.isFixedLengthVector()) VT = TLI.getContainerForFixedLengthVector(VT); @@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = VT; // Establish the correct scalable-vector types for any fixed-length type. - if (VT.isFixedLengthVector()) + if (VT.isFixedLengthVector()) { + assert(Idx == 0); SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT); + } if (InVT.isFixedLengthVector()) InVT = TLI.getContainerForFixedLengthVector(InVT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7275eb..540c2e7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, - ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; + ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, + ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, + ISD::VP_USUBSAT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -830,7 +832,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT, Custom); - setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); @@ -956,6 +957,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // between vXf16 and vXf64 must be lowered as sequences which convert via // vXf32. setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); @@ -3240,45 +3242,49 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF, // Note that this method will also match potentially unappealing index // sequences, like <i32 0, i32 50939494>, however it is left to the caller to // determine whether this is worth generating code for. -static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { - unsigned NumElts = Op.getNumOperands(); +static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op, + unsigned EltSizeInBits) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); + if (!cast<BuildVectorSDNode>(Op)->isConstant()) + return std::nullopt; bool IsInteger = Op.getValueType().isInteger(); std::optional<unsigned> SeqStepDenom; std::optional<int64_t> SeqStepNum, SeqAddend; std::optional<std::pair<uint64_t, unsigned>> PrevElt; - unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits(); - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - // Assume undef elements match the sequence; we just have to be careful - // when interpolating across them. - if (Op.getOperand(Idx).isUndef()) + assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); + + // First extract the ops into a list of constant integer values. This may not + // be possible for floats if they're not all representable as integers. + SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands()); + const unsigned OpSize = Op.getScalarValueSizeInBits(); + for (auto [Idx, Elt] : enumerate(Op->op_values())) { + if (Elt.isUndef()) { + Elts[Idx] = std::nullopt; continue; - - uint64_t Val; + } if (IsInteger) { - // The BUILD_VECTOR must be all constants. - if (!isa<ConstantSDNode>(Op.getOperand(Idx))) - return std::nullopt; - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes<uint64_t>(EltSizeInBits); + Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize); } else { - // The BUILD_VECTOR must be all constants. - if (!isa<ConstantFPSDNode>(Op.getOperand(Idx))) - return std::nullopt; - if (auto ExactInteger = getExactInteger( - cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits)) - Val = *ExactInteger; - else + auto ExactInteger = + getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize); + if (!ExactInteger) return std::nullopt; + Elts[Idx] = *ExactInteger; } + } + + for (auto [Idx, Elt] : enumerate(Elts)) { + // Assume undef elements match the sequence; we just have to be careful + // when interpolating across them. + if (!Elt) + continue; if (PrevElt) { // Calculate the step since the last non-undef element, and ensure // it's consistent across the entire sequence. unsigned IdxDiff = Idx - PrevElt->second; - int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits); + int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits); // A zero-value value difference means that we're somewhere in the middle // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a @@ -3308,8 +3314,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { } // Record this non-undef element for later. - if (!PrevElt || PrevElt->first != Val) - PrevElt = std::make_pair(Val, Idx); + if (!PrevElt || PrevElt->first != *Elt) + PrevElt = std::make_pair(*Elt, Idx); } // We need to have logged a step for this to count as a legal index sequence. @@ -3318,21 +3324,12 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) { // Loop back through the sequence and validate elements we might have skipped // while waiting for a valid step. While doing this, log any sequence addend. - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - if (Op.getOperand(Idx).isUndef()) + for (auto [Idx, Elt] : enumerate(Elts)) { + if (!Elt) continue; - uint64_t Val; - if (IsInteger) { - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes<uint64_t>(EltSizeInBits); - } else { - Val = *getExactInteger( - cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits); - } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; - int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits); + int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits); if (!SeqAddend) SeqAddend = Addend; else if (Addend != SeqAddend) @@ -3598,7 +3595,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. - if (auto SimpleVID = isSimpleVIDSequence(Op)) { + if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) { int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; @@ -3853,11 +3850,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we're compiling for an exact VLEN value, we can split our work per // register in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) { + if (const auto VLen = Subtarget.getRealVLen(); + VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) { MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); @@ -4768,9 +4764,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, // If we don't know exact data layout, not much we can do. If this // is already m1 or smaller, no point in splitting further. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen) + const auto VLen = Subtarget.getRealVLen(); + if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen) return SDValue(); // Avoid picking up bitrotate patterns which we have a linear-in-lmul @@ -4781,7 +4776,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, return SDValue(); MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned VRegsPerSrc = NumElts / ElemsPerVReg; SmallVector<std::pair<int, SmallVector<int>>> @@ -5759,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP VP_CASE(BITREVERSE) // VP_BITREVERSE + VP_CASE(SADDSAT) // VP_SADDSAT + VP_CASE(UADDSAT) // VP_UADDSAT + VP_CASE(SSUBSAT) // VP_SSUBSAT + VP_CASE(USUBSAT) // VP_USUBSAT VP_CASE(BSWAP) // VP_BSWAP VP_CASE(CTLZ) // VP_CTLZ VP_CASE(CTTZ) // VP_CTTZ @@ -6798,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UDIV: case ISD::VP_SREM: case ISD::VP_UREM: + case ISD::VP_UADDSAT: + case ISD::VP_USUBSAT: + case ISD::VP_SADDSAT: + case ISD::VP_SSUBSAT: return lowerVPOp(Op, DAG); case ISD::VP_AND: case ISD::VP_OR: @@ -7384,6 +7387,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) return V; + // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) + // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) + if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) { + const APInt &TrueVal = TrueV->getAsAPIntVal(); + const APInt &FalseVal = FalseV->getAsAPIntVal(); + const int TrueValCost = RISCVMatInt::getIntMatCost( + TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + const int FalseValCost = RISCVMatInt::getIntMatCost( + FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + bool IsCZERO_NEZ = TrueValCost <= FalseValCost; + SDValue LHSVal = DAG.getConstant( + IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT); + SDValue RHSVal = + DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT); + SDValue CMOV = + DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ, + DL, VT, LHSVal, CondV); + return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal); + } + // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) // Unless we have the short forward branch optimization. if (!Subtarget.hasConditionalMoveFusion()) @@ -8313,15 +8336,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // constant index, we can always perform the extract in m1 (or // smaller) as we can determine the register corresponding to // the index in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx); - IdxC && MinVLen == MaxVLen && - VecVT.getSizeInBits().getKnownMinValue() > MinVLen) { + IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) { MVT M1VT = getLMUL1VT(ContainerVT); unsigned OrigIdx = IdxC->getZExtValue(); EVT ElemVT = VecVT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; unsigned SubRegIdx = OrigIdx / ElemsPerVReg; unsigned ExtractIdx = @@ -9782,15 +9803,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, if (OrigIdx == 0) return Op; - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); // If the subvector vector is a fixed-length type and we don't know VLEN // exactly, we cannot use subregister manipulation to simplify the codegen; we // don't know which register of a LMUL group contains the specific subvector // as we only know the minimum register size. Therefore we must slide the // vector group down the full amount. - if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) { + if (SubVecVT.isFixedLengthVector() && !VLen) { MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -9837,8 +9857,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if // we have a fixed length subvector, we need to adjust the index by 1/vscale. if (SubVecVT.isFixedLengthVector()) { - assert(MinVLen == MaxVLen); - unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock; + assert(VLen); + unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; auto Decompose = RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI); @@ -12872,6 +12892,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineSubOfBoolean(N, DAG)) return V; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1) @@ -12879,7 +12900,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, isNullConstant(N1.getOperand(1))) { ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get(); if (CCVal == ISD::SETLT) { - EVT VT = N->getValueType(0); SDLoc DL(N); unsigned ShAmt = N0.getValueSizeInBits() - 1; return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), @@ -12887,6 +12907,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, } } + // sub (zext, zext) -> sext (sub (zext, zext)) + // where the sum of the extend widths match, and the inner zexts + // add at least one bit. (For profitability on rvv, we use a + // power of two for both inner and outer extend.) + if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) && + N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND && + N0.hasOneUse() && N1.hasOneUse()) { + SDValue Src0 = N0.getOperand(0); + SDValue Src1 = N1.getOperand(0); + EVT SrcVT = Src0.getValueType(); + if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) && + SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 && + SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) { + LLVMContext &C = *DAG.getContext(); + EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C); + EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount()); + Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0); + Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, + DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1)); + } + } + // fold (sub x, (select lhs, rhs, cc, 0, y)) -> // (select lhs, rhs, cc, x, (sub x, y)) return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget); @@ -15978,7 +16021,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (Index.getOpcode() == ISD::BUILD_VECTOR && MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { - if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index); + // The sequence will be XLenVT, not the type of Index. Tell + // isSimpleVIDSequence this so we avoid overflow. + if (std::optional<VIDSequence> SimpleVID = + isSimpleVIDSequence(Index, Subtarget.getXLen()); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; const int64_t Addend = SimpleVID->Addend; diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index ff21fe1..af864ba 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) { // Return true if MI is a load for which there exists a compressed version. static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == RISCV::FLD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::LW: + case RISCV::LD: + return STI.hasStdExtCOrZca(); + case RISCV::FLW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FLD: + return STI.hasStdExtCOrZcd(); + } } // Return true if MI is a store for which there exists a compressed version. static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::SW: + case RISCV::SD: + return STI.hasStdExtCOrZca(); + case RISCV::FSW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FSD: + return STI.hasStdExtCOrZcd(); + } } // Find a single register and/or large offset which, if compressible, would @@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { const RISCVInstrInfo &TII = *STI.getInstrInfo(); // This optimization only makes sense if compressed instructions are emitted. - // FIXME: Support Zca, Zcf, Zcd granularity. - if (!STI.hasStdExtC()) + if (!STI.hasStdExtCOrZca()) return false; for (MachineBasicBlock &MBB : Fn) { diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index d15cb61..0be681d 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, let ReleaseAtCycles = noPredReleaseCycles; } + // Define SchedVars + def nameMX # PredSchedVar + : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>; + def nameMX # NoPredSchedVar + : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>; + // Allow multiclass to refer to SchedVars -- need to have NAME prefix. + defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar); + defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar); + // Tie behavior to predicate - def NAME # nameMX # "_Variant" : SchedWriteVariant<[ - SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>, - SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # nameMX # "_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast<SchedReadWrite>(nameMX), !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>; if IsWorstCase then { - def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[ - SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>, - SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # name # "_WorstCase_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast<SchedReadWrite>(name # "_WorstCase"), !cast<SchedReadWrite>(NAME # name # "_WorstCase_Variant")>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b60d7a..9ebf278 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -143,6 +143,10 @@ public: #include "RISCVGenSubtargetInfo.inc" bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; } + bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; } + bool hasStdExtCOrZcfOrZce() const { + return HasStdExtC || HasStdExtZcf || HasStdExtZce; + } bool hasStdExtZvl() const { return ZvlLen != 0; } bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; } bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index adef40e..3e20e45 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -84,7 +84,7 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination( static cl::opt<bool> EnableSinkFold("riscv-enable-sink-fold", cl::desc("Enable sinking and folding of instruction copies"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden, diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index e6e3560..28a63b9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -619,7 +619,8 @@ class GroupBuiltin<string name, Op operation> { !eq(operation, OpGroupNonUniformShuffleDown), !eq(operation, OpGroupBroadcast), !eq(operation, OpGroupNonUniformBroadcast), - !eq(operation, OpGroupNonUniformBroadcastFirst)); + !eq(operation, OpGroupNonUniformBroadcastFirst), + !eq(operation, OpGroupNonUniformRotateKHR)); bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical); } @@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>; defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>; +// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate +defm : DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>; +defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>; + // cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>; defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index cc438b2..10569ef 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) { static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, SPIRVGlobalRegistry *GR, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + const SPIRVSubtarget &ST) { // Read argument's access qualifier from metadata or default. SPIRV::AccessQualifier::AccessQualifier ArgAccessQual = getArgAccessQual(F, ArgIdx); @@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, if (MDTypeStr.ends_with("*")) ResArgType = GR->getOrCreateSPIRVTypeByName( MDTypeStr, MIRBuilder, - addressSpaceToStorageClass( - OriginalArgType->getPointerAddressSpace())); + addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(), + ST)); else if (MDTypeStr.ends_with("_t")) ResArgType = GR->getOrCreateSPIRVTypeByName( "opencl." + MDTypeStr.str(), MIRBuilder, @@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, assert(GR && "Must initialize the SPIRV type registry before lowering args."); GR->setCurrentFunc(MIRBuilder.getMF()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); + // Assign types and names to all args, and store their types for later. FunctionType *FTy = getOriginalFunctionType(F); SmallVector<SPIRVType *, 4> ArgTypeVRegs; @@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. if (VRegs[i].size() > 1) return false; - auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder); + auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST); GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF()); ArgTypeVRegs.push_back(SpirvTy); @@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (F.hasName()) buildOpName(FuncVReg, F.getName(), MIRBuilder); - // Get access to information about available extensions - const auto *ST = - static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); - // Handle entry points and function linkage. if (isEntryPoint(F)) { const auto &STI = MIRBuilder.getMF().getSubtarget<SPIRVSubtarget>(); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 47fec74..a1cb630 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( // TODO: change the implementation once opaque pointers are supported // in the SPIR-V specification. SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder); - auto SC = addressSpaceToStorageClass(PType->getAddressSpace()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget()); + auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST); // Null pointer means we have a loop in type definitions, make and // return corresponding OpTypeForwardPointer. if (SpvElementType == nullptr) { diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index f317b26..d34f802 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -31,6 +31,9 @@ public: return true; } + // prevent creation of jump tables + bool areJTsAllowed(const Function *) const override { return false; } + // This is to prevent sexts of non-i64 vector indices which are generated // within general IRTranslator hence type generation for it is omitted. MVT getVectorIdxTy(const DataLayout &DL) const override { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 0f11bc3..7c5252e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor "$r = OpGenericCastToPtrExplicit $t $p $s">; def OpBitcast : UnOp<"OpBitcast", 124>; +// SPV_INTEL_usm_storage_classes +def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>; +def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>; + // 3.42.12 Composite Instructions def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), @@ -765,6 +769,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>; def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>; def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>; +// SPV_KHR_subgroup_rotate +def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res), + (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops), + "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">; + // 3.49.7, Constant-Creation Instructions // - SPV_INTEL_function_pointers diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 53d19a1..7258d3b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) { } } +static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) { + switch (SC) { + case SPIRV::StorageClass::DeviceOnlyINTEL: + case SPIRV::StorageClass::HostOnlyINTEL: + return true; + default: + return false; + } +} + // In SPIR-V address space casting can only happen to and from the Generic -// storage class. We can also only case Workgroup, CrossWorkgroup, or Function +// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function // pointers to and from Generic pointers. As such, we can convert e.g. from // Workgroup to Function by going via a Generic pointer as an intermediary. All // other combinations can only be done by a bitcast, and are probably not safe. @@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr); SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg); - // Casting from an eligable pointer to Generic. + // don't generate a cast between identical storage classes + if (SrcSC == DstSC) + return true; + + // Casting from an eligible pointer to Generic. if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric); - // Casting from Generic to an eligable pointer. + // Casting from Generic to an eligible pointer. if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr); - // Casting between 2 eligable pointers using Generic as an intermediary. + // Casting between 2 eligible pointers using Generic as an intermediary. if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) { Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass); SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType( @@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, .addUse(Tmp) .constrainAllUses(TII, TRI, RBI); } + + // Check if instructions from the SPV_INTEL_usm_storage_classes extension may + // be applied + if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpPtrCastToCrossWorkgroupINTEL); + if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC)) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpCrossWorkgroupCastToPtrINTEL); + // TODO Should this case just be disallowed completely? // We're casting 2 other arbitrary address spaces, so have to bitcast. return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); @@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( } SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( PointerBaseType, I, TII, - addressSpaceToStorageClass(GV->getAddressSpace())); + addressSpaceToStorageClass(GV->getAddressSpace(), STI)); std::string GlobalIdent; if (!GV->hasName()) { @@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( unsigned AddrSpace = GV->getAddressSpace(); SPIRV::StorageClass::StorageClass Storage = - addressSpaceToStorageClass(AddrSpace); + addressSpaceToStorageClass(AddrSpace, STI); bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage && Storage != SPIRV::StorageClass::Function; SPIRV::LinkageType::LinkageType LnkType = diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 011a550..4f2e7a2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p2 = LLT::pointer(2, PSize); // UniformConstant const LLT p3 = LLT::pointer(3, PSize); // Workgroup const LLT p4 = LLT::pointer(4, PSize); // Generic - const LLT p5 = LLT::pointer(5, PSize); // Input + const LLT p5 = + LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device) + const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host) // TODO: remove copy-pasting here by using concatenation in some way. auto allPtrsScalarsAndVectors = { - p0, p1, p2, p3, p4, p5, s1, s8, s16, - s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, - v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, - v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + p0, p1, p2, p3, p4, p5, p6, s1, s8, s16, + s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, + v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, + v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; auto allScalarsAndVectors = { s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, @@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { auto allFloatAndIntScalars = allIntScalars; - auto allPtrs = {p0, p1, p2, p3, p4, p5}; - auto allWritablePtrs = {p0, p1, p3, p4}; + auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; + auto allWritablePtrs = {p0, p1, p3, p4, p5, p6}; for (auto Opc : TypeFoldingSupportingOpcs) getActionDefinitionsBuilder(Opc).custom(); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index dbda287..3be28c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1063,12 +1063,28 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR); } break; + case SPIRV::OpPtrCastToCrossWorkgroupINTEL: + case SPIRV::OpCrossWorkgroupCastToPtrINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes); + Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL); + } + break; case SPIRV::OpConstantFunctionPointerINTEL: if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) { Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers); Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL); } break; + case SPIRV::OpGroupNonUniformRotateKHR: + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate)) + report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the " + "following SPIR-V extension: SPV_KHR_subgroup_rotate", + false); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate); + Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR); + Reqs.addCapability(SPIRV::Capability::GroupNonUniform); + break; case SPIRV::OpGroupIMulKHR: case SPIRV::OpGroupFMulKHR: case SPIRV::OpGroupBitwiseAndKHR: diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index cbc16fa..1442168 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) { static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget()); SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(4).getImm())); + addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST)); // If the bitcast would be redundant, replace all uses with the source // register. @@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<MachineInstr *, 10> ToErase; @@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(3).getImm())); + addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST)); MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index e186154..79f1614 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -49,6 +49,12 @@ cl::list<SPIRV::Extension::Extension> Extensions( clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone", "Adds OptNoneINTEL value for Function Control mask that " "indicates a request to not optimize the function."), + clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes, + "SPV_INTEL_usm_storage_classes", + "Introduces two new storage classes that are sub classes of " + "the CrossWorkgroup storage class " + "that provides additional information that can enable " + "optimization."), clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups", "Allows work items in a subgroup to share data without the " "use of local memory and work group barriers, and to " @@ -75,6 +81,10 @@ cl::list<SPIRV::Extension::Extension> Extensions( "Allows to use the LinkOnceODR linkage type that is to let " "a function or global variable to be merged with other functions " "or global variables of the same name when linkage occurs."), + clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate, + "SPV_KHR_subgroup_rotate", + "Adds a new instruction that enables rotating values across " + "invocations within a subgroup."), clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers, "SPV_INTEL_function_pointers", "Allows translation of function pointers."))); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 4e5ac0d..b022b97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>; defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>; defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>; +defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>; defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; @@ -462,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; +defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -699,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>; defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>; defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>; defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>; +defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>; +defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Dim enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 05f766d..169d7cc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "SPIRV.h" #include "SPIRVInstrInfo.h" +#include "SPIRVSubtarget.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -146,15 +147,19 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 3; case SPIRV::StorageClass::Generic: return 4; + case SPIRV::StorageClass::DeviceOnlyINTEL: + return 5; + case SPIRV::StorageClass::HostOnlyINTEL: + return 6; case SPIRV::StorageClass::Input: return 7; default: - llvm_unreachable("Unable to get address space id"); + report_fatal_error("Unable to get address space id"); } } SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace) { +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) { switch (AddrSpace) { case 0: return SPIRV::StorageClass::Function; @@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) { return SPIRV::StorageClass::Workgroup; case 4: return SPIRV::StorageClass::Generic; + case 5: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::DeviceOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; + case 6: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::HostOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; case 7: return SPIRV::StorageClass::Input; default: - llvm_unreachable("Unknown address space"); + report_fatal_error("Unknown address space"); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index a33dc02..1af53dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -27,6 +27,7 @@ class MachineRegisterInfo; class Register; class StringRef; class SPIRVInstrInfo; +class SPIRVSubtarget; // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit @@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC); // Convert an LLVM IR address space to a SPIR-V storage class. SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace); +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI); SPIRV::MemorySemantics::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7c47790..36f0679 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,6 +43,8 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +extern cl::opt<bool> WasmEmitMultiValue; + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return Subtarget->hasMultivalue() || Outs.size() <= 1; + return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || + Outs.size() <= 1) && "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 1e95911..b969b83 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt<bool> WasmEmitMultiValue; + WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. MachineFunctionInfo *WebAssemblyFunctionInfo::clone( @@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) { + (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() || + !WasmEmitMultiValue)) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 3e2e029..2a84c90 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -24,6 +24,8 @@ using namespace llvm; +extern cl::opt<bool> WasmEmitMultiValue; + namespace { enum RuntimeLibcallSignature { @@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_func_f32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F32); break; case i64_i64_func_f64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F64); break; case i16_i16_func_i16_i16: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i32_i32_func_i32_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_iPTR: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); @@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 42043a7..3120b6b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -54,6 +54,15 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); +// A temporary option to control emission of multivalue until multivalue +// implementation is stable enough. We currently don't emit multivalue by +// default even if the feature section allows it. +// TODO Stabilize multivalue and delete this option +cl::opt<bool> + WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden, + cl::desc("WebAssembly: Emit multivalue in the backend"), + cl::init(false)); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { // Register the target. RegisterTargetMachine<WebAssemblyTargetMachine> X( diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 4a11dd2..a620ba9 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { } // namespace Error X86TargetMachine::buildCodeGenPipeline( - ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - MachineFunctionAnalysisManager &, raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, CodeGenFileType FileType, - CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) { + ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType, CGPassBuilderOption Opt, + PassInstrumentationCallbacks *PIC) { auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC); - return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType); + return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index f31c971..0fd3e47 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,10 +58,9 @@ public: createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; - Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &, - MachineFunctionAnalysisManager &, - raw_pwrite_stream &, raw_pwrite_stream *, - CodeGenFileType, CGPassBuilderOption, + Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, + raw_pwrite_stream *, CodeGenFileType, + CGPassBuilderOption, PassInstrumentationCallbacks *) override; bool isJIT() const { return IsJIT; } diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 4466d50..a4cc757 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1846,6 +1846,13 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); Features["usermsr"] = HasLeaf7Subleaf1 && ((EDX >> 15) & 1); Features["avx10.1-256"] = HasLeaf7Subleaf1 && ((EDX >> 19) & 1); + bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1); + Features["egpr"] = HasAPXF; + Features["push2pop2"] = HasAPXF; + Features["ppx"] = HasAPXF; + Features["ndd"] = HasAPXF; + Features["ccmp"] = HasAPXF; + Features["cf"] = HasAPXF; bool HasLeafD = MaxLevel >= 0xd && !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index fabb3c5f..5aefcbf 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -215,15 +215,10 @@ bool HotColdSplitting::isFunctionCold(const Function &F) const { return false; } -bool HotColdSplitting::isBasicBlockCold(BasicBlock *BB, - BranchProbability ColdProbThresh, - SmallPtrSetImpl<BasicBlock *> &ColdBlocks, - SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks, - BlockFrequencyInfo *BFI) const { - // This block is already part of some outlining region. - if (ColdBlocks.count(BB)) - return true; - +bool HotColdSplitting::isBasicBlockCold( + BasicBlock *BB, BranchProbability ColdProbThresh, + SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks, + BlockFrequencyInfo *BFI) const { if (BFI) { if (PSI->isColdBlock(BB, BFI)) return true; @@ -372,18 +367,12 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion( - const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, - DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { +// Determine if it is beneficial to split the \p Region. +bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE, + const BlockSequence &Region, + TargetTransformInfo &TTI) { assert(!Region.empty()); - // TODO: Pass BFI and BPI to update profile information. - CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, - /* BPI */ nullptr, AC, /* AllowVarArgs */ false, - /* AllowAlloca */ false, /* AllocaBlock */ nullptr, - /* Suffix */ "cold." + std::to_string(Count)); - // Perform a simple cost/benefit analysis to decide whether or not to permit // splitting. SetVector<Value *> Inputs, Outputs, Sinks; @@ -394,9 +383,18 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit << ", penalty = " << OutliningPenalty << "\n"); if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty) - return nullptr; + return false; - Function *OrigF = Region[0]->getParent(); + return true; +} + +// Split the single \p EntryPoint cold region. \p CE is the region code +// extractor. +Function *HotColdSplitting::extractColdRegion( + BasicBlock &EntryPoint, CodeExtractor &CE, + const CodeExtractorAnalysisCache &CEAC, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE) { + Function *OrigF = EntryPoint.getParent(); if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast<CallInst>(U); @@ -419,7 +417,7 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "HotColdSplit", - &*Region[0]->begin()) + &*EntryPoint.begin()) << ore::NV("Original", OrigF) << " split cold code into " << ore::NV("Split", OutF); }); @@ -428,9 +426,9 @@ Function *HotColdSplitting::extractColdRegion( ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", - &*Region[0]->begin()) + &*EntryPoint.begin()) << "Failed to extract region at block " - << ore::NV("Block", Region.front()); + << ore::NV("Block", &EntryPoint); }); return nullptr; } @@ -620,16 +618,18 @@ public: } // namespace bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { - bool Changed = false; - - // The set of cold blocks. + // The set of cold blocks outlined. SmallPtrSet<BasicBlock *, 4> ColdBlocks; + // The set of cold blocks cannot be outlined. + SmallPtrSet<BasicBlock *, 4> CannotBeOutlinedColdBlocks; + // Set of cold blocks obtained with RPOT. SmallPtrSet<BasicBlock *, 4> AnnotatedColdBlocks; - // The worklist of non-intersecting regions left to outline. - SmallVector<OutliningRegion, 2> OutliningWorklist; + // The worklist of non-intersecting regions left to outline. The first member + // of the pair is the entry point into the region to be outlined. + SmallVector<std::pair<BasicBlock *, CodeExtractor>, 2> OutliningWorklist; // Set up an RPO traversal. Experimentally, this performs better (outlines // more) than a PO traversal, because we prevent region overlap by keeping @@ -655,10 +655,18 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { if (ColdBranchProbDenom.getNumOccurrences()) ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue()); + unsigned OutlinedFunctionID = 1; // Find all cold regions. for (BasicBlock *BB : RPOT) { - if (!isBasicBlockCold(BB, ColdProbThresh, ColdBlocks, AnnotatedColdBlocks, - BFI)) + // This block is already part of some outlining region. + if (ColdBlocks.count(BB)) + continue; + + // This block is already part of some region cannot be outlined. + if (CannotBeOutlinedColdBlocks.count(BB)) + continue; + + if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI)) continue; LLVM_DEBUG({ @@ -681,50 +689,69 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { return markFunctionCold(F); } - // If this outlining region intersects with another, drop the new region. - // - // TODO: It's theoretically possible to outline more by only keeping the - // largest region which contains a block, but the extra bookkeeping to do - // this is tricky/expensive. - bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) { - return !ColdBlocks.insert(Block.first).second; - }); - if (RegionsOverlap) - continue; + do { + BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); + LLVM_DEBUG({ + dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; + for (BasicBlock *BB : SubRegion) + BB->dump(); + }); + + // TODO: Pass BFI and BPI to update profile information. + CodeExtractor CE( + SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, + /* BPI */ nullptr, AC, /* AllowVarArgs */ false, + /* AllowAlloca */ false, /* AllocaBlock */ nullptr, + /* Suffix */ "cold." + std::to_string(OutlinedFunctionID)); + + if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && + // If this outlining region intersects with another, drop the new + // region. + // + // TODO: It's theoretically possible to outline more by only keeping + // the largest region which contains a block, but the extra + // bookkeeping to do this is tricky/expensive. + none_of(SubRegion, [&](BasicBlock *Block) { + return ColdBlocks.contains(Block); + })) { + ColdBlocks.insert(SubRegion.begin(), SubRegion.end()); + + LLVM_DEBUG({ + for (auto *Block : SubRegion) + dbgs() << " contains cold block:" << Block->getName() << "\n"; + }); + + OutliningWorklist.emplace_back( + std::make_pair(SubRegion[0], std::move(CE))); + ++OutlinedFunctionID; + } else { + // The cold block region cannot be outlined. + for (auto *Block : SubRegion) + if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) || + (PDT->dominates(BB, Block) && DT->dominates(Block, BB))) + // Will skip this cold block in the loop to save the compile time + CannotBeOutlinedColdBlocks.insert(Block); + } + } while (!Region.empty()); - OutliningWorklist.emplace_back(std::move(Region)); ++NumColdRegionsFound; } } if (OutliningWorklist.empty()) - return Changed; + return false; // Outline single-entry cold regions, splitting up larger regions as needed. - unsigned OutlinedFunctionID = 1; // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. CodeExtractorAnalysisCache CEAC(F); - do { - OutliningRegion Region = OutliningWorklist.pop_back_val(); - assert(!Region.empty() && "Empty outlining region in worklist"); - do { - BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); - LLVM_DEBUG({ - dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; - for (BasicBlock *BB : SubRegion) - BB->dump(); - }); - - Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, - ORE, AC, OutlinedFunctionID); - if (Outlined) { - ++OutlinedFunctionID; - Changed = true; - } - } while (!Region.empty()); - } while (!OutliningWorklist.empty()); + for (auto &BCE : OutliningWorklist) { + Function *Outlined = + extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE); + assert(Outlined && "Should be outlined"); + (void)Outlined; + } - return Changed; + return true; } bool HotColdSplitting::run(Module &M) { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 4176d56..77ca36d 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1471,7 +1471,6 @@ private: OMPRTL_omp_get_num_threads, OMPRTL_omp_in_parallel, OMPRTL_omp_get_cancellation, - OMPRTL_omp_get_thread_limit, OMPRTL_omp_get_supported_active_levels, OMPRTL_omp_get_level, OMPRTL_omp_get_ancestor_thread_num, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ed47de2..33ed1d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1543,11 +1543,14 @@ static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP) { +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) return nullptr; // No constant folding of this. + // See if the value can be truncated to bfloat and then reextended. + if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) + return Type::getBFloatTy(CFP->getContext()); // See if the value can be truncated to half and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEhalf())) + if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) return Type::getHalfTy(CFP->getContext()); // See if the value can be truncated to float and then reextended. if (fitsInFPType(CFP, APFloat::IEEEsingle())) @@ -1562,7 +1565,7 @@ static Type *shrinkFPConstant(ConstantFP *CFP) { // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. -static Type *shrinkFPConstantVector(Value *V) { +static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { auto *CV = dyn_cast<Constant>(V); auto *CVVTy = dyn_cast<FixedVectorType>(V->getType()); if (!CV || !CVVTy) @@ -1582,7 +1585,7 @@ static Type *shrinkFPConstantVector(Value *V) { if (!CFP) return nullptr; - Type *T = shrinkFPConstant(CFP); + Type *T = shrinkFPConstant(CFP, PreferBFloat); if (!T) return nullptr; @@ -1597,7 +1600,7 @@ static Type *shrinkFPConstantVector(Value *V) { } /// Find the minimum FP type we can safely truncate to. -static Type *getMinimumFPType(Value *V) { +static Type *getMinimumFPType(Value *V, bool PreferBFloat) { if (auto *FPExt = dyn_cast<FPExtInst>(V)) return FPExt->getOperand(0)->getType(); @@ -1605,7 +1608,7 @@ static Type *getMinimumFPType(Value *V) { // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. if (auto *CFP = dyn_cast<ConstantFP>(V)) - if (Type *T = shrinkFPConstant(CFP)) + if (Type *T = shrinkFPConstant(CFP, PreferBFloat)) return T; // We can only correctly find a minimum type for a scalable vector when it is @@ -1617,7 +1620,7 @@ static Type *getMinimumFPType(Value *V) { // Try to shrink a vector of FP constants. This returns nullptr on scalable // vectors - if (Type *T = shrinkFPConstantVector(V)) + if (Type *T = shrinkFPConstantVector(V, PreferBFloat)) return T; return V->getType(); @@ -1686,8 +1689,10 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); - Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + Type *LHSMinType = + getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); + Type *RHSMinType = + getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4af455c..87c8dca 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2387,6 +2387,20 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses, return NonNull; } + if (match(V, m_SExtLike(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateSExt(AV, V->getType()) : NonNull; + return nullptr; + } + + if (match(V, m_Trunc(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateTrunc(AV, V->getType()) : NonNull; + return nullptr; + } + return nullptr; } diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 393afc9..33add6d 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -348,7 +348,7 @@ private: void instrumentGlobals(); Value *getPC(IRBuilder<> &IRB); - Value *getSP(IRBuilder<> &IRB); + Value *getFP(IRBuilder<> &IRB); Value *getFrameRecordInfo(IRBuilder<> &IRB); void instrumentPersonalityFunctions(); @@ -1148,7 +1148,7 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { // Extract some entropy from the stack pointer for the tags. // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ // between functions). - Value *StackPointerLong = getSP(IRB); + Value *StackPointerLong = getFP(IRB); Value *StackTag = applyTagMask(IRB, IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20))); @@ -1165,7 +1165,7 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag, } Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) { - Value *StackPointerLong = getSP(IRB); + Value *StackPointerLong = getFP(IRB); Value *UARTag = applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift)); @@ -1232,7 +1232,7 @@ Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) { return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy); } -Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { +Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) { if (!CachedSP) { // FIXME: use addressofreturnaddress (but implement it in aarch64 backend // first). @@ -1251,7 +1251,7 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) { // Prepare ring buffer data. Value *PC = getPC(IRB); - Value *SP = getSP(IRB); + Value *SP = getFP(IRB); // Mix SP and PC. // Assumptions: diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index db05c63..9b6a39e 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -499,6 +499,8 @@ static Decomposition decompose(Value *V, if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64) return V; + bool IsKnownNonNegative = false; + // Decompose \p V used with a signed predicate. if (IsSigned) { if (auto *CI = dyn_cast<ConstantInt>(V)) { @@ -507,6 +509,14 @@ static Decomposition decompose(Value *V, } Value *Op0; Value *Op1; + + if (match(V, m_SExt(m_Value(Op0)))) + V = Op0; + else if (match(V, m_NNegZExt(m_Value(Op0)))) { + V = Op0; + IsKnownNonNegative = true; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) return MergeResults(Op0, Op1, IsSigned); @@ -529,7 +539,7 @@ static Decomposition decompose(Value *V, } } - return V; + return {V, IsKnownNonNegative}; } if (auto *CI = dyn_cast<ConstantInt>(V)) { @@ -539,7 +549,6 @@ static Decomposition decompose(Value *V, } Value *Op0; - bool IsKnownNonNegative = false; if (match(V, m_ZExt(m_Value(Op0)))) { IsKnownNonNegative = true; V = Op0; diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9235850..6ce9eb3 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -47,11 +47,6 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" -static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned( - "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, - cl::desc("Enables canonicalization of signed relational predicates to " - "unsigned (e.g. sgt => ugt)")); - STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -90,6 +85,8 @@ STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null"); STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed"); +STATISTIC(NumSMinMax, + "Number of llvm.s{min,max} intrinsics simplified to unsigned"); STATISTIC(NumUDivURemsNarrowedExpanded, "Number of bound udiv's/urem's expanded"); STATISTIC(NumZExt, "Number of non-negative deductions"); @@ -289,9 +286,6 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { - if (!CanonicalizeICmpPredicatesToUnsigned) - return false; - // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -528,17 +522,40 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { } // See if this min/max intrinsic always picks it's one specific operand. +// If not, check whether we can canonicalize signed minmax into unsigned version static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true); - if (Result == LazyValueInfo::Unknown) - return false; + ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), + /*UndefAllowed*/ false); + ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1), + /*UndefAllowed*/ false); + if (LHS_CR.icmp(Pred, RHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getLHS()); + MM->eraseFromParent(); + return true; + } + if (RHS_CR.icmp(Pred, LHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getRHS()); + MM->eraseFromParent(); + return true; + } - ++NumMinMax; - MM->replaceAllUsesWith(MM->getOperand(!Result)); - MM->eraseFromParent(); - return true; + if (MM->isSigned() && + ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR, + RHS_CR)) { + ++NumSMinMax; + IRBuilder<> B(MM); + MM->replaceAllUsesWith(B.CreateBinaryIntrinsic( + MM->getIntrinsicID() == Intrinsic::smin ? Intrinsic::umin + : Intrinsic::umax, + MM->getLHS(), MM->getRHS())); + MM->eraseFromParent(); + return true; + } + + return false; } // Rewrite this with.overflow intrinsic as non-overflowing. diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 1bf50d7..851eab0 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1221,6 +1221,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( Value::use_iterator I, E, Next; for (I = V->use_begin(), E = V->use_end(); I != E;) { Use &U = *I; + User *CurUser = U.getUser(); // Some users may see the same pointer operand in multiple operands. Skip // to the next instruction. @@ -1231,11 +1232,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. - U.set(NewV); + CurUser->replaceUsesOfWith(V, NewV); continue; } - User *CurUser = U.getUser(); // Skip if the current user is the new value itself. if (CurUser == NewV) continue; @@ -1311,10 +1311,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( while (isa<PHINode>(InsertPos)) ++InsertPos; - U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + // This instruction may contain multiple uses of V, update them all. + CurUser->replaceUsesOfWith( + V, new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); } else { - U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), - V->getType())); + CurUser->replaceUsesOfWith( + V, ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), + V->getType())); } } } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 627c863..08021f3b 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -7033,7 +7033,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // SCEVExpander for both use in preheader and latch const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); - SCEVExpanderCleaner ExpCleaner(Expander); assert(Expander.isSafeToExpand(TermValueS) && "Terminating value was checked safe in canFoldTerminatingCondition"); @@ -7064,10 +7063,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, BI->setCondition(NewTermCond); + Expander.clear(); OldTermCond->eraseFromParent(); DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); - - ExpCleaner.markResultUsed(); } } diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f4f3070..260f31b 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -291,9 +291,9 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( InstructionCost TotalSpeculationCost = 0; unsigned NotHoistedInstCount = 0; for (const auto &I : FromBlock) { - // Make note of any DPValues that need hoisting. - for (DbgRecord &DR : I.getDbgValueRange()) { - DPValue &DPV = cast<DPValue>(DR); + // Make note of any DPValues that need hoisting. DPLabels + // get left behind just like llvm.dbg.labels. + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (HasNoUnhoistedInstr(DPV.location_ops())) DPValuesToHoist[DPV.getInstruction()].push_back(&DPV); } diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 7fd6759..5bb109a 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -386,7 +386,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { SmallVector<DPValue *, 8> ToBeRemoved; SmallDenseSet<DebugVariable> VariableSet; for (auto &I : reverse(*BB)) { - for (DPValue &DPV : reverse(DPValue::filter(I.getDbgValueRange()))) { + for (DbgRecord &DR : reverse(I.getDbgValueRange())) { + if (isa<DPLabel>(DR)) { + // Emulate existing behaviour (see comment below for dbg.declares). + // FIXME: Don't do this. + VariableSet.clear(); + continue; + } + + DPValue &DPV = cast<DPValue>(DR); // Skip declare-type records, as the debug intrinsic method only works // on dbg.value intrinsics. if (DPV.getType() == DPValue::LocationType::Declare) { diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 8ebcf0c..bab0651 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1585,8 +1585,30 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, return cast<DILocalVariable>(NewVar); }; - auto UpdateDPValuesOnInst = [&](Instruction &I) -> void { - for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { + auto UpdateDbgLabel = [&](auto *LabelRecord) { + // Point the label record to a fresh label within the new function if + // the record was not inlined from some other function. + if (LabelRecord->getDebugLoc().getInlinedAt()) + return; + DILabel *OldLabel = LabelRecord->getLabel(); + DINode *&NewLabel = RemappedMetadata[OldLabel]; + if (!NewLabel) { + DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( + *OldLabel->getScope(), *NewSP, Ctx, Cache); + NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), + OldLabel->getFile(), OldLabel->getLine()); + } + LabelRecord->setLabel(cast<DILabel>(NewLabel)); + }; + + auto UpdateDbgRecordsOnInst = [&](Instruction &I) -> void { + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) { + UpdateDbgLabel(DPL); + continue; + } + + DPValue &DPV = cast<DPValue>(DR); // Apply the two updates that dbg.values get: invalid operands, and // variable metadata fixup. if (any_of(DPV.location_ops(), IsInvalidLocation)) { @@ -1599,13 +1621,11 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, } if (!DPV.getDebugLoc().getInlinedAt()) DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable())); - DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(), - *NewSP, Ctx, Cache)); } }; for (Instruction &I : instructions(NewFunc)) { - UpdateDPValuesOnInst(I); + UpdateDbgRecordsOnInst(I); auto *DII = dyn_cast<DbgInfoIntrinsic>(&I); if (!DII) @@ -1614,17 +1634,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, // Point the intrinsic to a fresh label within the new function if the // intrinsic was not inlined from some other function. if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) { - if (DLI->getDebugLoc().getInlinedAt()) - continue; - DILabel *OldLabel = DLI->getLabel(); - DINode *&NewLabel = RemappedMetadata[OldLabel]; - if (!NewLabel) { - DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( - *OldLabel->getScope(), *NewSP, Ctx, Cache); - NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), - OldLabel->getFile(), OldLabel->getLine()); - } - DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel)); + UpdateDbgLabel(DLI); continue; } @@ -1658,6 +1668,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, if (const DebugLoc &DL = I.getDebugLoc()) I.setDebugLoc( DebugLoc::replaceInlinedAtSubprogram(DL, *NewSP, Ctx, Cache)); + for (DbgRecord &DR : I.getDbgValueRange()) + DR.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DR.getDebugLoc(), + *NewSP, Ctx, Cache)); // Loop info metadata may contain line locations. Fix them up. auto updateLoopInfoLoc = [&Ctx, &Cache, NewSP](Metadata *MD) -> Metadata * { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 08fdd3b..2ff7c01 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -111,8 +111,7 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) { void StackInfoBuilder::visit(Instruction &Inst) { // Visit non-intrinsic debug-info records attached to Inst. - for (DbgRecord &DR : Inst.getDbgValueRange()) { - DPValue &DPV = cast<DPValue>(DR); + for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) { if (!isInterestingAlloca(*AI)) diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 6e46469..91ab279 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -538,6 +538,11 @@ Value *Mapper::mapValue(const Value *V) { } void Mapper::remapDPValue(DbgRecord &DR) { + if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) { + DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel()))); + return; + } + DPValue &V = cast<DPValue>(DR); // Remap variables and DILocations. auto *MappedVar = mapMetadata(V.getVariable()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4e33474..de4e56f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2422,18 +2422,25 @@ private: /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. Must form single-register vector. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional<TargetTransformInfo::ShuffleKind> isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, - SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part); + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, + bool ForOrder); /// Checks if the gathered \p VL can be represented as multi-register /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns per-register series of ShuffleKind, if gathered values can be /// represented as shuffles of previous tree entries. \p Mask is filled with /// the shuffle mask (also on per-register base). @@ -2441,7 +2448,7 @@ private: isGatherShuffledEntry( const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, - unsigned NumParts); + unsigned NumParts, bool ForOrder = false); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask, std::optional<BoUpSLP::OrdersType> BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); - unsigned NumScalars = TE.Scalars.size(); + // Try to find subvector extract/insert patterns and reorder only such + // patterns. + SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); + Type *ScalarTy = GatheredScalars.front()->getType(); + int NumScalars = GatheredScalars.size(); + if (!isValidElementType(ScalarTy)) + return std::nullopt; + auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars); + int NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= NumScalars) + NumParts = 1; + SmallVector<int> ExtractMask; + SmallVector<int> Mask; + SmallVector<SmallVector<const TreeEntry *>> Entries; + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles = + isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts, + /*ForOrder=*/true); + // No shuffled operands - ignore. + if (GatherShuffles.empty() && ExtractShuffles.empty()) + return std::nullopt; OrdersType CurrentOrder(NumScalars, NumScalars); - SmallVector<int> Positions; - SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; - // Try to find all gathered scalars that are gets vectorized in other - // vectorize node. Here we can have only one single tree vector node to - // correctly identify order of the gathered scalars. - for (unsigned I = 0; I < NumScalars; ++I) { - Value *V = TE.Scalars[I]; - if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) - continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. - return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(TE.Scalars)) { + // Perfect match in the graph, will reuse the previously vectorized + // node. Cost is 0. + std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); + return CurrentOrder; + } + auto IsSplatMask = [](ArrayRef<int> Mask) { + int SingleElt = PoisonMaskElem; + return all_of(Mask, [&](int I) { + if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) + SingleElt = I; + return I == PoisonMaskElem || I == SingleElt; + }); + }; + // Exclusive broadcast mask - ignore. + if ((ExtractShuffles.empty() && IsSplatMask(Mask) && + (Entries.size() != 1 || + Entries.front().front()->ReorderIndices.empty())) || + (GatherShuffles.empty() && IsSplatMask(ExtractMask))) + return std::nullopt; + SmallBitVector ShuffledSubMasks(NumParts); + auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder, + ArrayRef<int> Mask, int PartSz, int NumParts, + function_ref<unsigned(unsigned)> GetVF) { + for (int I : seq<int>(0, NumParts)) { + if (ShuffledSubMasks.test(I)) + continue; + const int VF = GetVF(I); + if (VF == 0) + continue; + MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz); + // Shuffle of at least 2 vectors - ignore. + if (any_of(Slice, [&](int I) { return I != NumScalars; })) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; + } + // Try to include as much elements from the mask as possible. + int FirstMin = INT_MAX; + int SecondVecFound = false; + for (int K : seq<int>(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) { + Value *V = GatheredScalars[I * PartSz + K]; + if (isConstant(V) && !isa<PoisonValue>(V)) { + SecondVecFound = true; + break; + } continue; - UsedPositions.reset(CurrentOrder[Lane]); + } + if (Idx < VF) { + if (FirstMin > Idx) + FirstMin = Idx; + } else { + SecondVecFound = true; + break; + } } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); - } - } - // Need to keep the order if we have a vector entry and at least 2 scalars or - // the vectorized entry has just 2 scalars. - if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; + FirstMin = (FirstMin / PartSz) * PartSz; + // Shuffle of at least 2 vectors - ignore. + if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); continue; } - if (*It == NumScalars) { - *It = I; - ++I; + for (int K : seq<int>(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) + continue; + Idx -= FirstMin; + if (Idx >= PartSz) { + SecondVecFound = true; + break; + } + if (CurrentOrder[I * PartSz + Idx] > + static_cast<unsigned>(I * PartSz + K) && + CurrentOrder[I * PartSz + Idx] != + static_cast<unsigned>(I * PartSz + Idx)) + CurrentOrder[I * PartSz + Idx] = I * PartSz + K; + } + // Shuffle of at least 2 vectors - ignore. + if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; } - ++It; } - return std::move(CurrentOrder); + }; + int PartSz = NumScalars / NumParts; + if (!ExtractShuffles.empty()) + TransformMaskToOrder( + CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { + if (!ExtractShuffles[I]) + return 0U; + unsigned VF = 0; + for (unsigned Idx : seq<unsigned>(0, PartSz)) { + int K = I * PartSz + Idx; + if (ExtractMask[K] == PoisonMaskElem) + continue; + if (!TE.ReuseShuffleIndices.empty()) + K = TE.ReuseShuffleIndices[K]; + if (!TE.ReorderIndices.empty()) + K = std::distance(TE.ReorderIndices.begin(), + find(TE.ReorderIndices, K)); + auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]); + if (!EI) + continue; + VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType()) + ->getElementCount() + .getKnownMinValue()); + } + return VF; + }); + // Check special corner case - single shuffle of the same entry. + if (GatherShuffles.size() == 1 && NumParts != 1) { + if (ShuffledSubMasks.any()) + return std::nullopt; + PartSz = NumScalars; + NumParts = 1; } - return std::nullopt; + if (!Entries.empty()) + TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { + if (!GatherShuffles[I]) + return 0U; + return std::max(Entries[I].front()->getVectorFactor(), + Entries[I].back()->getVectorFactor()); + }); + int NumUndefs = + count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); + if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) + return std::nullopt; + return std::move(CurrentOrder); } namespace { @@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because // element 3 is used twice in the second submask. unsigned Sz = TE.Scalars.size(); - if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, - Sz)) + if (TE.State == TreeEntry::NeedToGather) { + if (std::optional<OrdersType> CurrentOrder = + findReusedOrderedScalars(TE)) { + SmallVector<int> Mask; + fixupOrderingIndices(*CurrentOrder); + inversePermutation(*CurrentOrder, Mask); + ::addMask(Mask, TE.ReuseShuffleIndices); + OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); + unsigned Sz = TE.Scalars.size(); + for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { + for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz))) + if (Idx != PoisonMaskElem) + Res[Idx + K * Sz] = I + K * Sz; + } + return std::move(Res); + } + } + if (Sz == 2 && TE.getVectorFactor() == 4 && + TTI->getNumberOfParts(FixedVectorType::get( + TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1) return std::nullopt; + if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz)) { + SmallVector<int> ReorderMask(Sz, PoisonMaskElem); + if (TE.ReorderIndices.empty()) + std::iota(ReorderMask.begin(), ReorderMask.end(), 0); + else + inversePermutation(TE.ReorderIndices, ReorderMask); + ::addMask(ReorderMask, TE.ReuseShuffleIndices); + unsigned VF = ReorderMask.size(); + OrdersType ResOrder(VF, VF); + unsigned NumParts = VF / Sz; + SmallBitVector UsedVals(NumParts); + for (unsigned I = 0; I < VF; I += Sz) { + int Val = PoisonMaskElem; + unsigned UndefCnt = 0; + if (any_of(ArrayRef(ReorderMask).slice(I, Sz), + [&](int Idx) { + if (Val == PoisonMaskElem && Idx != PoisonMaskElem) + Val = Idx; + if (Idx == PoisonMaskElem) + ++UndefCnt; + return Idx != PoisonMaskElem && Idx != Val; + }) || + Val >= static_cast<int>(NumParts) || UsedVals.test(Val) || + UndefCnt > Sz / 2) + return std::nullopt; + UsedVals.set(Val); + for (unsigned K = 0; K < NumParts; ++K) + ResOrder[Val + Sz * K] = I + K; + } + return std::move(ResOrder); + } unsigned VF = TE.getVectorFactor(); // Try build correct order for extractelement instructions. SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), @@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); std::advance(It, Sz); } - if (all_of(enumerate(ResOrder), + if (TE.State == TreeEntry::NeedToGather && + all_of(enumerate(ResOrder), [](const auto &Data) { return Data.index() == Data.value(); })) return std::nullopt; // No need to reorder. return std::move(ResOrder); @@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { OrdersType CurrentOrder; bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, /*ResizeAllowed=*/true); - if (Reuse || !CurrentOrder.empty()) { - if (!CurrentOrder.empty()) - fixupOrderingIndices(CurrentOrder); + if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); - } } // If the gather node is <undef, v, .., poison> and // insertelement poison, v, 0 [+ permute] @@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { InstructionCost InsertIdxCost = TTI->getVectorInstrCost( Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx, PoisonValue::get(Ty), *It); - if (InsertFirstCost + PermuteCost < InsertIdxCost) + if (InsertFirstCost + PermuteCost < InsertIdxCost) { + OrdersType Order(Sz, Sz); + Order[Idx] = 0; return std::move(Order); + } } } if (isSplat(TE.Scalars)) @@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const { std::iota(It, std::next(It, Sz), 0); } +static void combineOrders(MutableArrayRef<unsigned> Order, + ArrayRef<unsigned> SecondaryOrder) { + assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && + "Expected same size of orders"); + unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + for (unsigned Idx : seq<unsigned>(0, Sz)) { + if (Order[Idx] != Sz) + UsedIndices.set(Order[Idx]); + } + if (SecondaryOrder.empty()) { + for (unsigned Idx : seq<unsigned>(0, Sz)) + if (Order[Idx] == Sz && !UsedIndices.test(Idx)) + Order[Idx] = Idx; + } else { + for (unsigned Idx : seq<unsigned>(0, Sz)) + if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz && + !UsedIndices.test(SecondaryOrder[Idx])) + Order[Idx] = SecondaryOrder[Idx]; + } +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; @@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() { } if (OrdersUses.empty()) continue; + auto IsIdentityOrder = [](ArrayRef<unsigned> Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq<unsigned>(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; // Choose the most used order. - ArrayRef<unsigned> BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + unsigned IdentityCnt = 0; + unsigned FilledIdentityCnt = 0; + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (!Pair.first.empty()) + FilledIdentityCnt += Pair.second; + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef<unsigned> BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty order) + // with same number of uses, as the new candidate order, we can choose + // this candidate order. + if (Cnt < Pair.second || + (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && + Cnt == Pair.second && !BestOrder.empty() && + IsIdentityOrder(BestOrder))) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } // Set order of the user node. - if (BestOrder.empty()) + if (IsIdentityOrder(BestOrder)) continue; + fixupOrderingIndices(BestOrder); SmallVector<int> Mask; inversePermutation(BestOrder, Mask); SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); @@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands( void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector<TreeEntry *> OrderedEntries; - DenseMap<const TreeEntry *, OrdersType> GathersToOrders; + DenseSet<const TreeEntry *> GathersToOrders; // Find all reorderable leaf nodes with the given VF. // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. @@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize) || !TE->ReuseShuffleIndices.empty()) - GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + GathersToOrders.insert(TE.get()); } } @@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || (TE->State == TreeEntry::NeedToGather && - GathersToOrders.count(TE))) || + GathersToOrders.contains(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || !all_of(drop_begin(TE->UserTreeIndices), [TE](const EdgeInfo &EI) { @@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const auto Order = [&]() -> const OrdersType { if (OpTE->State == TreeEntry::NeedToGather || !OpTE->ReuseShuffleIndices.empty()) - return GathersToOrders.find(OpTE)->second; + return getReorderingData(*OpTE, /*TopToBottom=*/false) + .value_or(OrdersType(1)); return OpTE->ReorderIndices; }(); + // The order is partially ordered, skip it in favor of fully non-ordered + // orders. + if (Order.size() == 1) + continue; unsigned NumOps = count_if( Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { return P.second == OpTE; @@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { (IgnoreReorder && TE->Idx == 0)) return true; if (TE->State == TreeEntry::NeedToGather) { - auto It = GathersToOrders.find(TE); - if (It != GathersToOrders.end()) - return !It->second.empty(); + if (GathersToOrders.contains(TE)) + return !getReorderingData(*TE, /*TopToBottom=*/false) + .value_or(OrdersType(1)) + .empty(); return true; } return false; @@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { ++Res.first->second; } } - // Choose the best order. - ArrayRef<unsigned> BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + if (OrdersUses.empty()) { + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + auto IsIdentityOrder = [](ArrayRef<unsigned> Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq<unsigned>(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; + // Choose the most used order. + unsigned IdentityCnt = 0; + unsigned VF = Data.second.front().second->getVectorFactor(); + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef<unsigned> BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty + // order) with same number of uses, as the new candidate order, we can + // choose this candidate order. + if (Cnt < Pair.second) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } - // Set order of the user node (reordering of operands and user nodes). - if (BestOrder.empty()) { + // Set order of the user node. + if (IsIdentityOrder(BestOrder)) { for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) OrderedEntries.remove(Op.second); continue; } + fixupOrderingIndices(BestOrder); // Erase operands from OrderedEntries list and adjust their orders. VisitedOps.clear(); SmallVector<int> Mask; @@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } V1 = Constant::getNullValue( FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + // Not identity/broadcast? Try to see if the original vector is better. + if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && + CommonVF == CommonMask.size() && + any_of(enumerate(CommonMask), + [](const auto &&P) { + return P.value() != PoisonMaskElem && + static_cast<unsigned>(P.value()) != P.index(); + }) && + any_of(CommonMask, + [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { + SmallVector<int> ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + ::addMask(CommonMask, ReorderMask); + } } else if (V1 && P2.isNull()) { // Shuffle single vector. CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements(); @@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, std::optional<TargetTransformInfo::ShuffleKind> BoUpSLP::isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, - SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) { + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) { Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. @@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { + if (ForOrder) { + if (VTE->State != TreeEntry::Vectorize) { + auto It = MultiNodeScalars.find(V); + if (It == MultiNodeScalars.end()) + continue; + VTE = *It->getSecond().begin(); + // Iterate through all vectorized nodes. + auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) { + return MTE->State == TreeEntry::Vectorize; + }); + if (MIt == It->getSecond().end()) + continue; + VTE = *MIt; + } + } Instruction &LastBundleInst = getLastInstructionInBundle(VTE); if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) continue; @@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // scalar in the list. for (const std::pair<unsigned, int> &Pair : EntryLanes) { unsigned Idx = Part * VL.size() + Pair.second; - Mask[Idx] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + Mask[Idx] = + Pair.first * VF + + (ForOrder ? std::distance( + Entries[Pair.first]->Scalars.begin(), + find(Entries[Pair.first]->Scalars, VL[Pair.second])) + : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { @@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> BoUpSLP::isGatherShuffledEntry( const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, - SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, - unsigned NumParts) { + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts, + bool ForOrder) { assert(NumParts > 0 && NumParts < VL.size() && "Expected positive number of registers."); Entries.clear(); @@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry( ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize); SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); std::optional<TTI::ShuffleKind> SubRes = - isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, + ForOrder); if (!SubRes) SubEntries.clear(); Res.push_back(SubRes); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 240d4bd..a2a203c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -385,9 +385,6 @@ struct VPTransformState { VPValue2ValueTy VPValue2Value; - /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). - Value *CanonicalIV = nullptr; - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. InnerLoopVectorizer *ILV; |