aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h9
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h24
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp96
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
10 files changed, 100 insertions, 134 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 979a8b0..4b22c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
+#include <array>
namespace llvm {
@@ -45,7 +46,7 @@ struct GCNRegPressure {
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
}
- void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ void clear() { Value.fill(0); }
unsigned getNumRegs(RegKind Kind) const {
assert(Kind < TOTAL_KINDS);
@@ -127,9 +128,7 @@ struct GCNRegPressure {
bool less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
- bool operator==(const GCNRegPressure &O) const {
- return std::equal(&Value[0], &Value[ValueArraySize], O.Value);
- }
+ bool operator==(const GCNRegPressure &O) const { return Value == O.Value; }
bool operator!=(const GCNRegPressure &O) const {
return !(*this == O);
@@ -160,7 +159,7 @@ private:
/// Pressure for all register kinds (first all regular registers kinds, then
/// all tuple register kinds).
- unsigned Value[ValueArraySize];
+ std::array<unsigned, ValueArraySize> Value;
static unsigned getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI);
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..09ef6ac 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
+ // 32-bit ABS is legal for AMDGPU except for R600
+ setOperationAction(ISD::ABS, MVT::i32, Expand);
+
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 942e784..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
- const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
- this]() -> bool {
- if (CmpValue != 0)
- return false;
-
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
- return false;
-
- bool CanOptimize = false;
-
- // For S_OP that set SCC = DST!=0, do the transformation
- //
- // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
- if (setsSCCifResultIsNonZero(*Def))
- CanOptimize = true;
-
- // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
- // the same value that will be calculated by s_cmp_lg_*
- //
- // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
- // imm), 0)
- if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
- bool Op1IsNonZeroImm =
- Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
- bool Op2IsZeroImm =
- Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
- if (Op1IsNonZeroImm && Op2IsZeroImm)
- CanOptimize = true;
- }
-
- if (!CanOptimize)
- return false;
-
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
-
- if (MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
- return true;
- };
-
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
+ I->killsRegister(AMDGPU::SCC, &RI))
return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10810,7 +10753,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 32, true, false);
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10818,7 +10761,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 64, true, false);
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ee99a74..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -709,30 +709,6 @@ public:
}
}
- static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
- if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- return false;
- // Compares have no result
- if (MI.isCompare())
- return false;
- switch (MI.getOpcode()) {
- default:
- return true;
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_U32:
- case AMDGPU::S_ADDC_U32:
- case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUBB_U32:
- case AMDGPU::S_MIN_I32:
- case AMDGPU::S_MIN_U32:
- case AMDGPU::S_MAX_I32:
- case AMDGPU::S_MAX_U32:
- case AMDGPU::S_ADDK_I32:
- return false;
- }
- }
-
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b80c3c9..4947d03 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -760,6 +761,7 @@ private:
void handleCallArguments(CallBase &CB);
void handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO, unsigned Idx);
+ bool isInstFullyOverDefined(Instruction &Inst);
private:
friend class InstVisitor<SCCPInstVisitor>;
@@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
void SCCPInstVisitor::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
- unsigned NumActiveIncoming = 0;
+ if (isInstFullyOverDefined(PN))
+ return;
+ SmallVector<unsigned> FeasibleIncomingIndices;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+ FeasibleIncomingIndices.push_back(i);
+ }
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is a constant
// range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
+ if (StructType *STy = dyn_cast<StructType>(PN.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement PhiState = getStructValueState(&PN, i);
+ if (PhiState.isOverdefined())
+ continue;
+ for (unsigned j : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV =
+ getStructValueState(PN.getIncomingValue(j), i);
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i);
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
+ }
+ } else {
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
}
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- ValueLatticeElement &PhiStateRef = ValueState[&PN];
- mergeInValue(PhiStateRef, &PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
void SCCPInstVisitor::visitReturnInst(ReturnInst &I) {
@@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
}
}
+bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) {
+ // For structure Type, we handle each member separately.
+ // A structure object won't be considered as overdefined when
+ // there is at least one member that is not overdefined.
+ if (StructType *STy = dyn_cast<StructType>(Inst.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) {
+ if (!getStructValueState(&Inst, i).isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+ return getValueState(&Inst).isOverdefined();
+}
+
void SCCPInstVisitor::solve() {
// Process the work lists until they are empty!
while (!BBWorkList.empty() || !InstWorkList.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..84d2ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -4075,6 +4079,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions take
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..7e074c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..d1e67e6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..f5f616f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..9a2497e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.