diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 39 | ||||
-rw-r--r-- | llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 55 |
5 files changed, 59 insertions, 51 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 5630580..82fc240 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -367,19 +367,6 @@ def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let BaseClassOrder = 10000; } -def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> { - let CopyCost = 1; - let Size = 16; - let isAllocatable = 0; - let HasSGPR = 1; -} - // TODO: Do we need to set DwarfRegAlias on register tuples? def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, @@ -774,12 +761,6 @@ def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32, let BaseClassOrder = 10000; } -def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, - (add LDS_DIRECT)> { - let isAllocatable = 0; - let CopyCost = -1; -} - let GeneratePressureSet = 0, HasSGPR = 1 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
@@ -797,7 +778,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16, - EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16, + EXEC_LO_LO16, EXEC_HI_LO16, M0_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> { let Size = 16; let isAllocatable = 0; @@ -805,7 +786,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, } def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, - (add SReg_32_XM0_XEXEC, M0_CLASS)> { + (add SReg_32_XM0_XEXEC, M0)> { let AllocationPriority = 0; } @@ -830,7 +811,7 @@ def APERTURE_Class : SIRegisterClass<"AMDGPU", Reg64Types.types, 32, // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, - (add SReg_32_XM0, M0_CLASS)> { + (add SReg_32_XM0, M0)> { let AllocationPriority = 0; let HasSGPR = 1; let BaseClassOrder = 32; @@ -842,7 +823,7 @@ def SGPR_NULL256 : SIReg<"null">; let GeneratePressureSet = 0 in { def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add SReg_32, LDS_DIRECT_CLASS)> { + (add SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasSGPR = 1; let Size = 32; @@ -981,7 +962,7 @@ defm "" : SRegClass<32, Reg1024Types.types, SGPR_1024Regs>; } def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32, LDS_DIRECT_CLASS)> { + (add VGPR_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 32; @@ -1096,21 +1077,21 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> { } def VS_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, - (add VGPR_16, SReg_32, 
LDS_DIRECT_CLASS)> { + (add VGPR_16, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 16; } def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, - (add VGPR_16_Lo128, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_16_Lo128, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 16; } def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1118,7 +1099,7 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v } def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32_Lo128, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1126,7 +1107,7 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2 } def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32_Lo256, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index efdbd12..447f05c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1417,9 +1417,9 @@ class SelectQCbi<CondCode Cond, DAGOperand InTyImm, Pseudo OpNode > let Predicates = [HasVendorXqciac, IsRV32] in { def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; -def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)), +def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, 
GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)), +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index cba282c..a2e8c69 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -15,13 +15,12 @@ using namespace llvm; -namespace { /// Return true if and only if the given instruction does not modify the memory /// location referenced. Note that an idemptent atomicrmw may still have /// ordering effects on nearby instructions, or be volatile. /// TODO: Common w/ the version in AtomicExpandPass, and change the term used. /// Idemptotent is confusing in this context. -bool isIdempotentRMW(AtomicRMWInst& RMWI) { +static bool isIdempotentRMW(AtomicRMWInst &RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) switch(RMWI.getOperation()) { case AtomicRMWInst::FAdd: // -0.0 @@ -59,7 +58,7 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) { /// Return true if the given instruction always produces a value in memory /// equivalent to its value operand. 
-bool isSaturating(AtomicRMWInst& RMWI) { +static bool isSaturating(AtomicRMWInst &RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) switch (RMWI.getOperation()) { case AtomicRMWInst::FMax: @@ -98,7 +97,6 @@ bool isSaturating(AtomicRMWInst& RMWI) { return C->isMaxValue(false); }; } -} // namespace Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 917004c..048cdf4 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -132,8 +132,6 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); -namespace llvm { - static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); @@ -146,7 +144,9 @@ static cl::opt<unsigned> MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); +namespace llvm { extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm // FIXME: Remove this flag when it is no longer necessary to convert // llvm.dbg.declare to avoid inaccurate debug info. 
Setting this to false @@ -158,8 +158,6 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes; static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); -} // end namespace llvm - std::optional<Instruction *> InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { // Handle target specific intrinsics diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 8bba634..48055ad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5152,14 +5152,18 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (ExtraCase && Values.size() < 2) return false; - // TODO: Preserve branch weight metadata, similarly to how - // foldValueComparisonIntoPredecessors preserves it. + SmallVector<uint32_t> BranchWeights; + const bool HasProfile = !ProfcheckDisableMetadataFixes && + extractBranchWeights(*BI, BranchWeights); // Figure out which block is which destination. BasicBlock *DefaultBB = BI->getSuccessor(1); BasicBlock *EdgeBB = BI->getSuccessor(0); - if (!TrueWhenEqual) + if (!TrueWhenEqual) { std::swap(DefaultBB, EdgeBB); + if (HasProfile) + std::swap(BranchWeights[0], BranchWeights[1]); + } BasicBlock *BB = BI->getParent(); @@ -5190,10 +5194,11 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (!isGuaranteedNotToBeUndefOrPoison(ExtraCase, AC, BI, nullptr)) ExtraCase = Builder.CreateFreeze(ExtraCase); - if (TrueWhenEqual) - Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB); - else - Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); + // We don't have any info about this condition. + auto *Br = TrueWhenEqual ? 
Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) + : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), + DEBUG_TYPE); OldTI->eraseFromParent(); @@ -5220,6 +5225,17 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // Create the new switch instruction now. SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + if (HasProfile) { + // We know the weight of the default case. We don't know the weight of the + // other cases, but rather than completely lose profiling info, we split + // the remaining probability equally over them. + SmallVector<uint32_t> NewWeights(Values.size() + 1); + NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if + // !TrueWhenEqual. + for (auto &V : drop_begin(NewWeights)) + V = BranchWeights[0] / Values.size(); + setBranchWeights(*New, NewWeights, /*IsExpected=*/false); + } // Add all of the 'cases' to the switch instruction. for (ConstantInt *Val : Values) @@ -7211,6 +7227,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); BranchInst *RangeCheckBranch = nullptr; + BranchInst *CondBranch = nullptr; Builder.SetInsertPoint(SI); const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); @@ -7225,6 +7242,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize)); RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + CondBranch = RangeCheckBranch; if (DTU) Updates.push_back({DominatorTree::Insert, BB, LookupBB}); } @@ -7263,7 +7281,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted"); Value *LoBit = Builder.CreateTrunc( Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit"); - Builder.CreateCondBr(LoBit, 
LookupBB, SI->getDefaultDest()); + CondBranch = Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest()); if (DTU) { Updates.push_back({DominatorTree::Insert, MaskBB, LookupBB}); Updates.push_back({DominatorTree::Insert, MaskBB, SI->getDefaultDest()}); @@ -7303,19 +7321,32 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, if (DTU) Updates.push_back({DominatorTree::Insert, LookupBB, CommonDest}); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = CondBranch && !ProfcheckDisableMetadataFixes && + extractBranchWeights(*SI, BranchWeights); + uint64_t ToLookupWeight = 0; + uint64_t ToDefaultWeight = 0; + // Remove the switch. SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; - for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) { - BasicBlock *Succ = SI->getSuccessor(i); + for (unsigned I = 0, E = SI->getNumSuccessors(); I < E; ++I) { + BasicBlock *Succ = SI->getSuccessor(I); - if (Succ == SI->getDefaultDest()) + if (Succ == SI->getDefaultDest()) { + if (HasBranchWeights) + ToDefaultWeight += BranchWeights[I]; continue; + } Succ->removePredecessor(BB); if (DTU && RemovedSuccessors.insert(Succ).second) Updates.push_back({DominatorTree::Delete, BB, Succ}); + if (HasBranchWeights) + ToLookupWeight += BranchWeights[I]; } SI->eraseFromParent(); - + if (HasBranchWeights) + setFittedBranchWeights(*CondBranch, {ToLookupWeight, ToDefaultWeight}, + /*IsExpected=*/false); if (DTU) DTU->applyUpdates(Updates); |