diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegPressure.h | 9 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 69 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 24 | ||||
-rw-r--r-- | llvm/lib/Transforms/Utils/SCCPSolver.cpp | 96 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 12 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanUtils.h | 2 |
10 files changed, 100 insertions, 134 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..4b22c68 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() { Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -127,9 +128,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +159,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..09ef6ac 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); + // 32-bit ABS is legal for AMDGPU except for R600 + setOperationAction(ISD::ABS, MVT::i32, Expand); + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 942e784..50447f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; - const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, - this]() -> bool { - if (CmpValue != 0) - return false; - - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) - return false; - - bool CanOptimize = false; - - // For S_OP that set SCC = DST!=0, do the transformation - // - // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) - if (setsSCCifResultIsNonZero(*Def)) - CanOptimize = true; - - // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has - // the same value that will be calculated by s_cmp_lg_* - // - // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero - // imm), 0) - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - CanOptimize = true; - } - - if (!CanOptimize) - return false; - - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); - return true; - }; - const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, bool IsReversible, bool IsSigned) -> bool { @@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); + I != E; ++I) { + if (I->modifiesRegister(AMDGPU::SCC, &RI) || + I->killsRegister(AMDGPU::SCC, &RI)) return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; } MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); CmpInstr.eraseFromParent(); if (!MRI->use_nodbg_empty(DefReg)) { @@ -10810,7 +10753,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 32, true, false); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10818,7 +10761,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 64, true, false); } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index ee99a74..df27ec1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -709,30 +709,6 @@ public: } } - static bool setsSCCifResultIsNonZero(const MachineInstr &MI) { - if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - return false; - // Compares have no result - if (MI.isCompare()) - return false; - switch (MI.getOpcode()) { - default: - return true; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: - case AMDGPU::S_ADDC_U32: - case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: - case AMDGPU::S_SUBB_U32: - case AMDGPU::S_MIN_I32: - case AMDGPU::S_MIN_U32: - case AMDGPU::S_MAX_I32: - case AMDGPU::S_MAX_U32: - case AMDGPU::S_ADDK_I32: - return false; - } - } - static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b80c3c9..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -760,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. + ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. - ValueLatticeElement &PhiStateRef = ValueState[&PN]; - mergeInValue(PhiStateRef, &PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For structure Type, we handle each member separately. + // A structure object won't be considered as overdefined when + // there is at least one member that is not overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0e0b042..84d2ea6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -407,6 +407,10 @@ public: VPBasicBlock *getParent() { return Parent; } const VPBasicBlock *getParent() const { return Parent; } + /// \return the VPRegionBlock which the recipe belongs to. + VPRegionBlock *getRegion(); + const VPRegionBlock *getRegion() const; + /// The method which generates the output IR instructions that correspond to /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(VPTransformState &State) = 0; @@ -4075,6 +4079,14 @@ public: } }; +inline VPRegionBlock *VPRecipeBase::getRegion() { + return getParent()->getParent(); +} + +inline const VPRegionBlock *VPRecipeBase::getRegion() const { + return getParent()->getParent(); +} + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index f413c63..7e074c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, #ifndef NDEBUG auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + VPRegionBlock *Region = R->getRegion(); if (Region && Region->isReplicator()) { assert(Region->getNumSuccessors() == 1 && Region->getNumPredecessors() == 1 && "Expected SESE region!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7a98c75..d1e67e6b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = getParent()->getParent()->getCanonicalIV(); + auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && getScalarType() == CanIV->getScalarType(); } @@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr, State.AC->registerAssumption(II); assert( - (RepRecipe->getParent()->getParent() || + (RepRecipe->getRegion() || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || all_of(RepRecipe->operands(), [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && @@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, to_vector(operands()), VF); // If the recipe is not predicated (i.e. not in a replicate region), return // the scalar cost. Otherwise handle predicated cost. - if (!getParent()->getParent()->isReplicator()) + if (!getRegion()->isReplicator()) return ScalarCost; // Account for the phi nodes that we will create. @@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::Store: { // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - const VPRegionBlock *ParentRegion = getParent()->getParent(); + const VPRegionBlock *ParentRegion = getRegion(); if (ParentRegion && ParentRegion->isReplicator()) break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cae9aee8..f5f616f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, return nullptr; VPRegionBlock *EnclosingLoopRegion = HoistCandidate->getParent()->getEnclosingLoopRegion(); - assert((!HoistCandidate->getParent()->getParent() || - HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) && + assert((!HoistCandidate->getRegion() || + HoistCandidate->getRegion() == EnclosingLoopRegion) && "CFG in VPlan should still be flat, without replicate regions"); // Hoist candidate was already visited, no need to hoist. if (!Visited.insert(HoistCandidate).second) @@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides( // evolution. auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || + return R->getRegion() || R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); }; ValueToSCEVMapTy RewriteMap; @@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { continue; auto *DefR = cast<VPRecipeWithIRFlags>(&R); auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) { - VPRegionBlock *ParentRegion = - cast<VPRecipeBase>(U)->getParent()->getParent(); + VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion(); return !U->usesScalars(DefR) || ParentRegion != LoopRegion; }; if ((isa<VPReplicateRecipe>(DefR) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index cf95ac0..9a2497e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) { return true; if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) { - const VPRegionBlock *RegionOfR = Rep->getParent()->getParent(); + const VPRegionBlock *RegionOfR = Rep->getRegion(); // Don't consider recipes in replicate regions as uniform yet; their first // lane cannot be accessed when executing the replicate region for other // lanes. |