diff options
Diffstat (limited to 'llvm/lib')
36 files changed, 390 insertions, 310 deletions
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 3b3e7a4..a7c99b1 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2083,22 +2083,54 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (TBB == FBB) { MBB->splice(Loc, TBB, TBB->begin(), TIB); } else { + // Merge the debug locations, and hoist and kill the debug instructions from + // both branches. FIXME: We could probably try harder to preserve some debug + // instructions (but at least this isn't producing wrong locations). + MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc); + auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) { + assert(DI->isDebugInstr() && "Expected a debug instruction"); + if (DI->isDebugRef()) { + const TargetInstrInfo *TII = + MBB->getParent()->getSubtarget().getInstrInfo(); + const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE); + DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0, + DI->getDebugVariable(), DI->getDebugExpression()); + MBB->insert(Loc, &*DI); + return; + } + // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF. + if (DI->isDebugPHI()) { + DI->eraseFromParent(); + return; + } + + DI->setDebugValueUndef(); + DI->moveBefore(&*Loc); + }; + // TIB and FIB point to the end of the regions to hoist/merge in TBB and // FBB. MachineBasicBlock::iterator FE = FIB; MachineBasicBlock::iterator FI = FBB->begin(); for (MachineBasicBlock::iterator TI : make_early_inc_range(make_range(TBB->begin(), TIB))) { - // Move debug instructions and pseudo probes without modifying them. - // FIXME: This is the wrong thing to do for debug locations, which - // should at least be killed (and hoisted from BOTH blocks). - if (TI->isDebugOrPseudoInstr()) { - TI->moveBefore(&*Loc); + // Hoist and kill debug instructions from FBB. After this loop FI points + // to the next non-debug instruction to hoist (checked in assert after the + // TBB debug instruction handling code). + while (FI != FE && FI->isDebugInstr()) + HoistAndKillDbgInstr(FI++); + + // Kill debug instructions before moving. + if (TI->isDebugInstr()) { + HoistAndKillDbgInstr(TI); continue; } - // Get the next non-meta instruction in FBB. - FI = skipDebugInstructionsForward(FI, FE, false); + // FI and TI now point to identical non-debug instructions. + assert(FI != FE && "Unexpected end of FBB range"); + // Pseudo probes are excluded from the range when identifying foldable + // instructions, so we don't expect to see one now. + assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range"); // NOTE: The loop above checks CheckKillDead but we can't do that here as // it modifies some kill markers after the check. assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) && @@ -2111,6 +2143,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { ++FI; } } + FBB->erase(FBB->begin(), FIB); if (UpdateLiveIns) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 429a17a..7ea2512 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -211,8 +211,7 @@ void MachineFunction::init() { ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); - // FIXME: Use Function::hasOptSize(). - if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.getAlign() && !F.hasOptSize()) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index d4a3455..68b8a00 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() { ISD::SDIVFIX, ISD::SDIVFIXSAT, ISD::UDIVFIX, ISD::UDIVFIXSAT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::IS_FPCLASS}, + ISD::IS_FPCLASS, ISD::FCBRT, + ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::FEXP, + ISD::FEXP2, ISD::FEXP10, + ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FCEIL, ISD::FRINT, + ISD::FTRUNC, ISD::FROUNDEVEN, + ISD::FTAN, ISD::FACOS, + ISD::FASIN, ISD::FATAN, + ISD::FCOSH, ISD::FSINH, + ISD::FTANH, ISD::FATAN2}, VT, Expand); // Overflow operations default to expand @@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() { // These operations default to expand for vector types. if (VT.isVector()) - setOperationAction( - {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG, - ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, - ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND, - ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, - ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2}, - VT, Expand); + setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND_VECTOR_INREG, + ISD::SIGN_EXTEND_VECTOR_INREG, + ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR, + ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND}, + VT, Expand); // Constrained floating-point operations default to expand. #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ @@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() { {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128}, Expand); - // These library functions default to expand. - setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, - ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, - ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, - ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN, - ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH, - ISD::FATAN2}, - {MVT::f32, MVT::f64, MVT::f128}, Expand); - // Insert custom handling default for llvm.canonicalize.*. setOperationAction(ISD::FCANONICALIZE, {MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 408d07b..725e951 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1734,7 +1734,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal( Name == getInstrProfSectionName(IPSK_covdata, Triple::COFF, /*AddSegmentInfo=*/false) || Name == getInstrProfSectionName(IPSK_covname, Triple::COFF, - /*AddSegmentInfo=*/false)) + /*AddSegmentInfo=*/false) || + Name == ".llvmbc" || Name == ".llvmcmd") Kind = SectionKind::getMetadata(); int Selection = 0; unsigned Characteristics = getCOFFSectionFlags(Kind, TM); diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 2579fa3..0f19495 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -8,11 +8,11 @@ #include "llvm/Object/IRSymtab.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Comdat.h" @@ -213,9 +213,10 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) { - DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); +static StringSet<> buildPreservedSymbolsSet(const Triple &TT) { + StringSet<> PreservedSymbolSet; + PreservedSymbolSet.insert(std::begin(PreservedSymbols), + std::end(PreservedSymbols)); // FIXME: Do we need to pass in ABI fields from TargetOptions? RTLIB::RuntimeLibcallsInfo Libcalls(TT); for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { @@ -280,7 +281,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, setStr(Sym.IRName, GV->getName()); - static const DenseSet<StringRef> PreservedSymbolsSet = + static const StringSet<> PreservedSymbolsSet = buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp index 5bb04d0..b6f338f 100644 --- a/llvm/lib/Support/Debug.cpp +++ b/llvm/lib/Support/Debug.cpp @@ -24,11 +24,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Debug.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Support/raw_ostream.h" +#include <utility> #include "DebugOptions.h" @@ -38,27 +40,62 @@ using namespace llvm; +/// Parse a debug type string into a pair of the debug type and the debug level. +/// The expected format is "type[:level]", where the level is an optional +/// integer. +static std::pair<std::string, std::optional<int>> +parseDebugType(StringRef DbgType) { + std::optional<int> Level; + size_t ColonPos = DbgType.find(':'); + if (ColonPos != StringRef::npos) { + StringRef LevelStr = DbgType.substr(ColonPos + 1); + DbgType = DbgType.take_front(ColonPos); + if (LevelStr.empty()) + Level = 0; + else { + int parsedLevel; + if (to_integer(LevelStr, parsedLevel, 10)) + Level = parsedLevel; + } + } + return std::make_pair(DbgType.str(), Level); +} + // Even though LLVM might be built with NDEBUG, define symbols that the code // built without NDEBUG can depend on via the llvm/Support/Debug.h header. namespace llvm { /// Exported boolean set by the -debug option. bool DebugFlag = false; -static ManagedStatic<std::vector<std::string>> CurrentDebugType; +/// The current debug type and an optional debug level. +/// The debug level is the verbosity of the debug output. +/// 0 is a special level that acts as an opt-out for this specific debug type. +/// If provided, the debug output is enabled only if the user specified a level +/// at least as high as the provided level. +static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>> + CurrentDebugType; /// Return true if the specified string is the debug type /// specified on the command line, or if none was specified on the command line /// with the -debug-only=X option. -bool isCurrentDebugType(const char *DebugType) { +bool isCurrentDebugType(const char *DebugType, int Level) { if (CurrentDebugType->empty()) return true; + // Track if there is at least one debug type with a level, this is used + // to allow to opt-out of some DebugType and leaving all the others enabled. + bool HasEnabledDebugType = false; // See if DebugType is in list. Note: do not use find() as that forces us to // unnecessarily create an std::string instance. - for (auto &d : *CurrentDebugType) { - if (d == DebugType) + for (auto &D : *CurrentDebugType) { + HasEnabledDebugType = + HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0); + if (D.first != DebugType) + continue; + if (!D.second.has_value()) return true; + return D.second >= Level; } - return false; + return !HasEnabledDebugType; } /// Set the current debug type, as if the -debug-only=X @@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) { void setCurrentDebugTypes(const char **Types, unsigned Count) { CurrentDebugType->clear(); - llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count)); + CurrentDebugType->reserve(Count); + for (const char *Type : ArrayRef(Types, Count)) + CurrentDebugType->push_back(parseDebugType(Type)); } + } // namespace llvm // All Debug.h functionality is a no-op in NDEBUG mode. @@ -114,10 +154,10 @@ struct DebugOnlyOpt { if (Val.empty()) return; DebugFlag = true; - SmallVector<StringRef,8> dbgTypes; - StringRef(Val).split(dbgTypes, ',', -1, false); - for (auto dbgType : dbgTypes) - CurrentDebugType->push_back(std::string(dbgType)); + SmallVector<StringRef, 8> DbgTypes; + StringRef(Val).split(DbgTypes, ',', -1, false); + for (auto DbgType : DbgTypes) + CurrentDebugType->push_back(parseDebugType(DbgType)); } }; } // namespace @@ -129,8 +169,13 @@ struct CreateDebugOnly { static void *call() { return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>( "debug-only", - cl::desc("Enable a specific type of debug output (comma separated list " - "of types)"), + cl::desc( + "Enable a specific type of debug output (comma separated list " + "of types using the format \"type[:level]\", where the level " + "is an optional integer. The level can be set to 1, 2, 3, etc. to " + "control the verbosity of the output. Setting a debug-type level " + "to zero acts as an opt-out for this specific debug-type without " + "affecting the others."), cl::Hidden, cl::value_desc("debug string"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 49d8b44..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e..f25ce87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, - MVT::f32, Legal); + ISD::FROUNDEVEN, ISD::FTRUNC}, + {MVT::f16, MVT::f32}, Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); @@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else { + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + } else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 9a2bab1..0a0a107 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return getMaxNumVGPRs(MF.getFunction()); } +std::pair<unsigned, unsigned> +GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { + const unsigned MaxVectorRegs = getMaxNumVGPRs(F); + + unsigned MaxNumVGPRs = MaxVectorRegs; + unsigned MaxNumAGPRs = 0; + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (hasGFX90AInsts()) { + unsigned MinNumAGPRs = 0; + const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); + const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; + + // TODO: The lower bound should probably force the number of required + // registers up, overriding amdgpu-waves-per-eu. + std::tie(MinNumAGPRs, MaxNumAGPRs) = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR, + /*OnlyFirstRequired=*/true); + + if (MinNumAGPRs == DefaultNumAGPR.first) { + // Default to splitting half the registers if AGPRs are required. + MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; + } else { + // Align to accum_offset's allocation granularity. + MinNumAGPRs = alignTo(MinNumAGPRs, 4); + + MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); + } + + // Clamp values to be inbounds of our limits, and ensure min <= max. + + MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); + MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); + + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); + + assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + "invalid register counts"); + } else if (hasMAIInsts()) { + // On gfx908 the number of AGPRs always equals the number of VGPRs. + MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 88a269f..785ede3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1667,6 +1667,10 @@ public: return getMaxNumVGPRs(F); } + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. + std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index dd3f2fe..520c321 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. - if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch. - return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8d6c1d0..2aa6b4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO (?): Does this need to be taught how to read noalias.addrspace ? + + // See if any memory operand specifies an address space that involves scratch. + return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2ffb783..e042b59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -678,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with + /// SCRATCH_ memory operands. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch. + bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9f61bf8..9509199 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector ReservedRegs = TRI->getReservedRegs(MF); BitVector NonWwmAllocMask(TRI->getNumRegs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future // to have a balanced allocation between WWM values and per-thread vector @@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, NumRegs = std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs); - auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); // Try to use the highest available registers for now. Later after // vgpr-regalloc, they can be shifted to the lowest range. unsigned I = 0; @@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, // Reserve an arbitrary register and report the error. TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); MF.getFunction().getContext().emitError( - "can't find enough VGPRs for wwm-regalloc"); + "cannot find enough VGPRs for wwm-regalloc"); } } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420..f1262e11 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -602,7 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2536,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2551,11 +2548,24 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. + if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2658,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2674,6 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2686,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 84cfa87..f3acc5c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } -std::pair<unsigned, unsigned> -SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { - const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); - - unsigned MaxNumVGPRs = MaxVectorRegs; - unsigned MaxNumAGPRs = 0; - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. - if (ST.hasGFX90AInsts()) { - unsigned MinNumAGPRs = 0; - const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; - - // TODO: Move this logic into subtarget on IR function - // - // TODO: The lower bound should probably force the number of required - // registers up, overriding amdgpu-waves-per-eu. - std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( - MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, - /*OnlyFirstRequired=*/true); - - if (MinNumAGPRs == DefaultNumAGPR.first) { - // Default to splitting half the registers if AGPRs are required. - MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - } else { - // Align to accum_offset's allocation granularity. - MinNumAGPRs = alignTo(MinNumAGPRs, 4); - - MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); - } - - // Clamp values to be inbounds of our limits, and ensure min <= max. - - MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); - MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); - MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); - - assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && - "invalid register counts"); - } else if (ST.hasMAIInsts()) { - // On gfx908 the number of AGPRs always equals the number of VGPRs. - MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; - } - - return std::pair(MaxNumVGPRs, MaxNumAGPRs); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0008e5f..5508f07 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,11 +90,6 @@ public: /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number - /// of waves per execution unit required for the function \p MF. - std::pair<unsigned, unsigned> - getMaxNumVectorRegs(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 50217c3..9e4dbec 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl( // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI.getParent()->getParent(); - // FIXME: Use Function::hasOptSize(). - if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize)) + if (MF->getFunction().hasOptSize()) --Latency; } return Latency; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fca5dff..066b392 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); @@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Legal); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::FTRUNC, MVT::f16, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); + setOperationAction(ISD::FRINT, MVT::f16, Legal); + setOperationAction(ISD::FFLOOR, MVT::f16, Legal); + setOperationAction(ISD::FCEIL, MVT::f16, Legal); } if (Subtarget->hasNEON()) { diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index c86fa2b..54c3cea 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); const Function &F = MF.getFunction(); - bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = F.hasOptSize(); // Combine aggressively (for code size) ShouldCombineAggressively = diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp index 6eccf80..9d7776d 100644 --- a/llvm/lib/Target/Hexagon/HexagonMask.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp @@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.hasOptSize()) return false; // Mask instruction is available only from v66 if (!HST.hasV66Ops()) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 613cfb5..d96136c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2385,13 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op, return Res; } -static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) { - for (unsigned i = 0; i < Op->getNumOperands(); ++i) - if (isIntOrFPConstant(Op->getOperand(i))) - return true; - return false; -} - // Lower BUILD_VECTOR as broadcast load (if possible). // For example: // %a = load i8, ptr %ptr @@ -2441,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op); EVT ResTy = Op->getValueType(0); + unsigned NumElts = ResTy.getVectorNumElements(); SDLoc DL(Op); APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; + bool IsConstant = false; + bool UseSameConstant = true; + SDValue ConstantValue; bool Is128Vec = ResTy.is128BitVector(); bool Is256Vec = ResTy.is256BitVector(); @@ -2495,13 +2492,35 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (DAG.isSplatValue(Op, /*AllowUndefs=*/false)) return Op; - if (!isConstantBUILD_VECTOR(Node)) { + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (isIntOrFPConstant(Opi)) { + IsConstant = true; + if (!ConstantValue.getNode()) + ConstantValue = Opi; + else if (ConstantValue != Opi) + UseSameConstant = false; + } + } + + // If the type of BUILD_VECTOR is v2f64, custom legalizing it has no benefits. + if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) { + SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (!isIntOrFPConstant(Opi)) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi, + DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + } + return Result; + } + + if (!IsConstant) { // Use INSERT_VECTOR_ELT operations rather than expand to stores. // The resulting code is the same length as the expansion, but it doesn't // use memory operations. assert(ResTy.isVector()); - unsigned NumElts = ResTy.getVectorNumElements(); SDValue Op0 = Node->getOperand(0); SDValue Vector = DAG.getUNDEF(ResTy); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index ca03310..a2e48ab 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { if (FS.empty() && M.size() && F->hasFnAttribute("target-features")) FS = F->getFnAttribute("target-features").getValueAsString(); + std::string strFS = FS.str(); + if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool()) + strFS += strFS.empty() ? "+soft-float" : ",+soft-float"; + // Compute MIPS architecture attributes based on the default subtarget // that we'd have constructed. // FIXME: For ifunc related functions we could iterate over and look // for a feature string that doesn't match the default one. StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, - std::nullopt); + const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(), + MTM, std::nullopt); bool IsABICalls = STI.isABICalls(); const MipsABIInfo &ABI = MTM.getABI(); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 0e581a7..ec6b382 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Custom); - setOperationAction(ISD::ConstantFP, MVT::f64, Custom); - setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND, ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL, ISD::SIGN_EXTEND}); @@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); case ISD::READCYCLECOUNTER: return lowerREADCYCLECOUNTER(Op, DAG); - case ISD::ConstantFP: - return lowerConstantFP(Op, DAG); } return SDValue(); } @@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc); } -SDValue MipsTargetLowering::lowerConstantFP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getSimpleValueType(); - SDNode *N = Op.getNode(); - ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N); - - if (!CFP->isNaN() || Subtarget.isNaN2008()) { - return SDValue(); - } - - APFloat NaNValue = CFP->getValueAPF(); - auto &Sem = NaNValue.getSemantics(); - - // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN - // encodings, and one for sNaNs. Check every NaN constants and make sure - // they are correctly encoded for legacy encodings. - if (!NaNValue.isSignaling()) { - APFloat RealQNaN = NaNValue.getSNaN(Sem); - return DAG.getConstantFP(RealQNaN, DL, VT); - } - return SDValue(); -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 31ac5d4..c65c76c 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -592,7 +592,6 @@ class TargetRegisterClass; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 54845e5..607edd3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } +bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( + EVT ScalarTy) const { + if (!ScalarTy.isSimple()) + return false; + switch (ScalarTy.getSimpleVT().SimpleTy) { + case MVT::iPTR: + return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::f16: + case MVT::bf16: + case MVT::f32: + return true; + case MVT::i64: + case MVT::f64: + return Subtarget.hasVInstructionsI64(); + default: + return false; + } +} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalElementTypeForRVV(ScalarType)) + if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46..a788c0b7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,6 +384,7 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; + bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 30d8f85..3cbe668 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalElementTypeForRVV(VT.getScalarType()) || + if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99c..f0510ec 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 81de0a5..0164fcd 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,8 +97,6 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); -STATISTIC(MismatchedCloneAssignments, - "Number of callsites assigned to call multiple non-matching clones"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -2062,20 +2060,6 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } -// Return the clone number of the given function by extracting it from the -// memprof suffix. Assumes the caller has already confirmed it is a memprof -// clone. -static unsigned getMemProfCloneNum(const Function &F) { - assert(isMemProfClone(F)); - auto Pos = F.getName().find_last_of('.'); - assert(Pos > 0); - unsigned CloneNo; - bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); - assert(!Err); - (void)Err; - return CloneNo; -} - std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -3995,22 +3979,7 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); - auto NewCalleeCloneNo = CalleeFunc.cloneNo(); - if (isMemProfClone(*CurF)) { - // If we already assigned this callsite to call a specific non-default - // clone (i.e. not the original function which is clone 0), ensure that we - // aren't trying to now update it to call a different clone, which is - // indicative of a bug in the graph or function assignment. - auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); - if (CurCalleeCloneNo != NewCalleeCloneNo) { - LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " - << CurCalleeCloneNo << " now " << NewCalleeCloneNo - << "\n"); - MismatchedCloneAssignments++; - } - } - if (NewCalleeCloneNo > 0) + if (CalleeFunc.cloneNo() > 0) cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -4026,19 +3995,7 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - auto NewCalleeCloneNo = CalleeFunc.cloneNo(); - auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; - // If we already assigned this callsite to call a specific non-default - // clone (i.e. not the original function which is clone 0), ensure that we - // aren't trying to now update it to call a different clone, which is - // indicative of a bug in the graph or function assignment. - if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { - LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " - << CurCalleeCloneNo << " now " << NewCalleeCloneNo - << "\n"); - MismatchedCloneAssignments++; - } - CurCalleeCloneNo = NewCalleeCloneNo; + CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4746,19 +4703,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // where the callers were assigned to different clones of a function. } - auto FindFirstAvailFuncClone = [&]() { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) - return CF.first; - } - assert(false && - "Expected an available func clone for this callsite clone"); - }; - // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4875,7 +4819,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + // Find first function in FuncClonesToCallMap without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncClonesToCallMap size was smaller than the clone number. + for (auto &CF : FuncClonesToCallMap) { + if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { + FuncCloneAssignedToCurCallsiteClone = CF.first; + break; + } + } assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4889,31 +4842,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } - // If we didn't assign a function clone to this callsite clone yet, e.g. - // none of its callers has a non-null call, do the assignment here. - // We want to ensure that every callsite clone is assigned to some - // function clone, so that the call updates below work as expected. - // In particular if this is the original callsite, we want to ensure it - // is assigned to the original function, otherwise the original function - // will appear available for assignment to other callsite clones, - // leading to unintended effects. For one, the unknown and not updated - // callers will call into cloned paths leading to the wrong hints, - // because they still call the original function (clone 0). Also, - // because all callsites start out as being clone 0 by default, we can't - // easily distinguish between callsites explicitly assigned to clone 0 - // vs those never assigned, which can lead to multiple updates of the - // calls when invoking updateCall below, with mismatched clone values. - // TODO: Add a flag to the callsite nodes or some other mechanism to - // better distinguish and identify callsite clones that are not getting - // assigned to function clones as expected. - if (!FuncCloneAssignedToCurCallsiteClone) { - FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); - assert(FuncCloneAssignedToCurCallsiteClone && - "No available func clone for this callsite clone"); - AssignCallsiteCloneToFuncClone( - FuncCloneAssignedToCurCallsiteClone, Call, Clone, - /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); - } } if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(Node); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 033ef8b..a43a6ee 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc, auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0)); if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) && all_equal(Shuf->getShuffleMask()) && - Shuf->getType() == Shuf->getOperand(0)->getType()) { + ElementCount::isKnownGE(Shuf->getType()->getElementCount(), + cast<VectorType>(Shuf->getOperand(0)->getType()) + ->getElementCount())) { // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask // trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask - Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType()); + Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType( + Trunc.getType()->getScalarType()); + Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy); return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask()); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index f6780c0..ce1d9f1 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, if (DisableMemOPOPT) return false; - if (F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index d6bd92d..b5eb647 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1176,6 +1176,28 @@ private: return true; } + /// This function fixes PHI nodes after fusion in \p SafeToSink. + /// \p SafeToSink instructions are the instructions that are to be moved past + /// the fused loop. Thus, the PHI nodes in \p SafeToSink should be updated to + /// receive values from the fused loop if they are currently taking values + /// from the first loop (i.e. FC0)'s latch. + void fixPHINodes(ArrayRef<Instruction *> SafeToSink, + const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + for (Instruction *Inst : SafeToSink) { + // No update needed for non-PHI nodes. + PHINode *Phi = dyn_cast<PHINode>(Inst); + if (!Phi) + continue; + for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) { + if (Phi->getIncomingBlock(I) != FC0.Latch) + continue; + assert(FC1.Latch && "FC1 latch is not set"); + Phi->setIncomingBlock(I, FC1.Latch); + } + } + } + /// Collect instructions in the \p FC1 Preheader that can be hoisted /// to the \p FC0 Preheader or sunk into the \p FC1 Body bool collectMovablePreheaderInsts( @@ -1481,6 +1503,9 @@ private: assert(I->getParent() == FC1.Preheader); I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt()); } + // PHI nodes in SinkInsts need to be updated to receive values from the + // fused loop. + fixPHINodes(SinkInsts, FC0, FC1); } /// Determine if two fusion candidates have identical guards diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 9fe655e..fca09c6 100644 --- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) { static bool runImpl(Function &F, const TargetLibraryInfo &TLI, DominatorTree *DT) { - if (F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); LibCallsShrinkWrap CCDCE(TLI, DTU); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index ddb062b..571fa11 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1257,7 +1257,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { assert(Diff->getType()->isIntegerTy() && "difference must be of integer type"); Value *DiffV = expand(Diff); - Value *BaseV = &PN; + Value *BaseV = fixupLCSSAFormFor(&PN); if (PhiTy->isPointerTy()) { if (STy->isPointerTy()) return Builder.CreatePtrAdd(BaseV, DiffV); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6616e61f..7b7efb8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1363,11 +1363,15 @@ public: TTI.hasActiveVectorLength() && !EnableVPlanNativePath; if (EVLIsLegal) return; - // If for some reason EVL mode is unsupported, fallback to - // DataWithoutLaneMask to try to vectorize the loop with folded tail - // in a generic way. - ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, - TailFoldingStyle::DataWithoutLaneMask}; + // If for some reason EVL mode is unsupported, fallback to a scalar epilogue + // if it's allowed, or DataWithoutLaneMask otherwise. + if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) + ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None}; + else + ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, + TailFoldingStyle::DataWithoutLaneMask}; + LLVM_DEBUG( dbgs() << "LV: Preference for VP intrinsics indicated. Will " "not try to generate VP Intrinsics " @@ -4500,19 +4504,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; - if (MainLoopVF.isFixed()) { - // TODO: extend to support scalable VFs. - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(MainLoopVF).getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TC) && - "Trip count SCEV must be computable"); - RemainingIterations = SE.getURemExpr( - TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); - - // No iterations left to process in the epilogue. - if (RemainingIterations->isZero()) - return Result; + const SCEV *TC = + vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable"); + RemainingIterations = + SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); + + // No iterations left to process in the epilogue. + if (RemainingIterations->isZero()) + return Result; + if (MainLoopVF.isFixed()) { MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, SE.getConstant(TCType, MaxTripCount))) { |