Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/BranchFolding.cpp  47
-rw-r--r--  llvm/lib/CodeGen/MachineFunction.cpp  3
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp  34
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp  3
-rw-r--r--  llvm/lib/Object/IRSymtab.cpp  11
-rw-r--r--  llvm/lib/Support/Debug.cpp  69
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp  1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp  10
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.cpp  57
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h  4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp  29
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  26
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h  6
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  35
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp  61
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h  5
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp  3
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp  11
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp  2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonMask.cpp  2
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp  37
-rw-r--r--  llvm/lib/Target/Mips/MipsAsmPrinter.cpp  8
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.cpp  29
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.h  1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp  23
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h  1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp  2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h  4
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp  96
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp  8
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp  2
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp  25
-rw-r--r--  llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp  2
-rw-r--r--  llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp  2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  36
36 files changed, 390 insertions, 310 deletions
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 3b3e7a4..a7c99b1 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2083,22 +2083,54 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (TBB == FBB) {
MBB->splice(Loc, TBB, TBB->begin(), TIB);
} else {
+ // Merge the debug locations, and hoist and kill the debug instructions from
+ // both branches. FIXME: We could probably try harder to preserve some debug
+ // instructions (but at least this isn't producing wrong locations).
+ MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc);
+ auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) {
+ assert(DI->isDebugInstr() && "Expected a debug instruction");
+ if (DI->isDebugRef()) {
+ const TargetInstrInfo *TII =
+ MBB->getParent()->getSubtarget().getInstrInfo();
+ const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE);
+ DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0,
+ DI->getDebugVariable(), DI->getDebugExpression());
+ MBB->insert(Loc, &*DI);
+ return;
+ }
+ // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF.
+ if (DI->isDebugPHI()) {
+ DI->eraseFromParent();
+ return;
+ }
+
+ DI->setDebugValueUndef();
+ DI->moveBefore(&*Loc);
+ };
+
// TIB and FIB point to the end of the regions to hoist/merge in TBB and
// FBB.
MachineBasicBlock::iterator FE = FIB;
MachineBasicBlock::iterator FI = FBB->begin();
for (MachineBasicBlock::iterator TI :
make_early_inc_range(make_range(TBB->begin(), TIB))) {
- // Move debug instructions and pseudo probes without modifying them.
- // FIXME: This is the wrong thing to do for debug locations, which
- // should at least be killed (and hoisted from BOTH blocks).
- if (TI->isDebugOrPseudoInstr()) {
- TI->moveBefore(&*Loc);
+ // Hoist and kill debug instructions from FBB. After this loop FI points
+ // to the next non-debug instruction to hoist (checked in assert after the
+ // TBB debug instruction handling code).
+ while (FI != FE && FI->isDebugInstr())
+ HoistAndKillDbgInstr(FI++);
+
+ // Kill debug instructions before moving.
+ if (TI->isDebugInstr()) {
+ HoistAndKillDbgInstr(TI);
continue;
}
- // Get the next non-meta instruction in FBB.
- FI = skipDebugInstructionsForward(FI, FE, false);
+ // FI and TI now point to identical non-debug instructions.
+ assert(FI != FE && "Unexpected end of FBB range");
+ // Pseudo probes are excluded from the range when identifying foldable
+ // instructions, so we don't expect to see one now.
+ assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range");
// NOTE: The loop above checks CheckKillDead but we can't do that here as
// it modifies some kill markers after the check.
assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) &&
@@ -2111,6 +2143,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
++FI;
}
}
+
FBB->erase(FBB->begin(), FIB);
if (UpdateLiveIns)
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 429a17a..7ea2512 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -211,8 +211,7 @@ void MachineFunction::init() {
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
- // FIXME: Use Function::hasOptSize().
- if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.getAlign() && !F.hasOptSize())
Alignment = std::max(Alignment,
STI->getTargetLowering()->getPrefFunctionAlignment());
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index d4a3455..68b8a00 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() {
ISD::SDIVFIX, ISD::SDIVFIXSAT,
ISD::UDIVFIX, ISD::UDIVFIXSAT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::IS_FPCLASS},
+ ISD::IS_FPCLASS, ISD::FCBRT,
+ ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::FEXP,
+ ISD::FEXP2, ISD::FEXP10,
+ ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FCEIL, ISD::FRINT,
+ ISD::FTRUNC, ISD::FROUNDEVEN,
+ ISD::FTAN, ISD::FACOS,
+ ISD::FASIN, ISD::FATAN,
+ ISD::FCOSH, ISD::FSINH,
+ ISD::FTANH, ISD::FATAN2},
VT, Expand);
// Overflow operations default to expand
@@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() {
// These operations default to expand for vector types.
if (VT.isVector())
- setOperationAction(
- {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
- ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
- ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
- ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
- VT, Expand);
+ setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND_VECTOR_INREG,
+ ISD::SIGN_EXTEND_VECTOR_INREG,
+ ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR,
+ ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+ VT, Expand);
// Constrained floating-point operations default to expand.
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
@@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() {
{MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
Expand);
- // These library functions default to expand.
- setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
- ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
- ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN,
- ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
- ISD::FATAN2},
- {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
// Insert custom handling default for llvm.canonicalize.*.
setOperationAction(ISD::FCANONICALIZE,
{MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 408d07b..725e951 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1734,7 +1734,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
Name == getInstrProfSectionName(IPSK_covdata, Triple::COFF,
/*AddSegmentInfo=*/false) ||
Name == getInstrProfSectionName(IPSK_covname, Triple::COFF,
- /*AddSegmentInfo=*/false))
+ /*AddSegmentInfo=*/false) ||
+ Name == ".llvmbc" || Name == ".llvmcmd")
Kind = SectionKind::getMetadata();
int Selection = 0;
unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 2579fa3..0f19495 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -8,11 +8,11 @@
#include "llvm/Object/IRSymtab.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Comdat.h"
@@ -213,9 +213,10 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
- DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
+static StringSet<> buildPreservedSymbolsSet(const Triple &TT) {
+ StringSet<> PreservedSymbolSet;
+ PreservedSymbolSet.insert(std::begin(PreservedSymbols),
+ std::end(PreservedSymbols));
// FIXME: Do we need to pass in ABI fields from TargetOptions?
RTLIB::RuntimeLibcallsInfo Libcalls(TT);
for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
@@ -280,7 +281,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
- static const DenseSet<StringRef> PreservedSymbolsSet =
+ static const StringSet<> PreservedSymbolsSet =
buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 5bb04d0..b6f338f 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -24,11 +24,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
+#include <utility>
#include "DebugOptions.h"
@@ -38,27 +40,62 @@
using namespace llvm;
+/// Parse a debug type string into a pair of the debug type and the debug level.
+/// The expected format is "type[:level]", where the level is an optional
+/// integer.
+static std::pair<std::string, std::optional<int>>
+parseDebugType(StringRef DbgType) {
+ std::optional<int> Level;
+ size_t ColonPos = DbgType.find(':');
+ if (ColonPos != StringRef::npos) {
+ StringRef LevelStr = DbgType.substr(ColonPos + 1);
+ DbgType = DbgType.take_front(ColonPos);
+ if (LevelStr.empty())
+ Level = 0;
+ else {
+ int parsedLevel;
+ if (to_integer(LevelStr, parsedLevel, 10))
+ Level = parsedLevel;
+ }
+ }
+ return std::make_pair(DbgType.str(), Level);
+}
+
// Even though LLVM might be built with NDEBUG, define symbols that the code
// built without NDEBUG can depend on via the llvm/Support/Debug.h header.
namespace llvm {
/// Exported boolean set by the -debug option.
bool DebugFlag = false;
-static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+/// The current debug type and an optional debug level.
+/// The debug level is the verbosity of the debug output.
+/// 0 is a special level that acts as an opt-out for this specific debug type.
+/// If provided, the debug output is enabled only if the user specified a level
+/// at least as high as the provided level.
+static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>>
+ CurrentDebugType;
/// Return true if the specified string is the debug type
/// specified on the command line, or if none was specified on the command line
/// with the -debug-only=X option.
-bool isCurrentDebugType(const char *DebugType) {
+bool isCurrentDebugType(const char *DebugType, int Level) {
if (CurrentDebugType->empty())
return true;
+ // Track if there is at least one debug type with a level; this is used to
+ // allow opting out of some DebugTypes while leaving all the others enabled.
+ bool HasEnabledDebugType = false;
// See if DebugType is in list. Note: do not use find() as that forces us to
// unnecessarily create an std::string instance.
- for (auto &d : *CurrentDebugType) {
- if (d == DebugType)
+ for (auto &D : *CurrentDebugType) {
+ HasEnabledDebugType =
+ HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0);
+ if (D.first != DebugType)
+ continue;
+ if (!D.second.has_value())
return true;
+ return D.second >= Level;
}
- return false;
+ return !HasEnabledDebugType;
}
/// Set the current debug type, as if the -debug-only=X
@@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) {
void setCurrentDebugTypes(const char **Types, unsigned Count) {
CurrentDebugType->clear();
- llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
+ CurrentDebugType->reserve(Count);
+ for (const char *Type : ArrayRef(Types, Count))
+ CurrentDebugType->push_back(parseDebugType(Type));
}
+
} // namespace llvm
// All Debug.h functionality is a no-op in NDEBUG mode.
@@ -114,10 +154,10 @@ struct DebugOnlyOpt {
if (Val.empty())
return;
DebugFlag = true;
- SmallVector<StringRef,8> dbgTypes;
- StringRef(Val).split(dbgTypes, ',', -1, false);
- for (auto dbgType : dbgTypes)
- CurrentDebugType->push_back(std::string(dbgType));
+ SmallVector<StringRef, 8> DbgTypes;
+ StringRef(Val).split(DbgTypes, ',', -1, false);
+ for (auto DbgType : DbgTypes)
+ CurrentDebugType->push_back(parseDebugType(DbgType));
}
};
} // namespace
@@ -129,8 +169,13 @@ struct CreateDebugOnly {
static void *call() {
return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>(
"debug-only",
- cl::desc("Enable a specific type of debug output (comma separated list "
- "of types)"),
+ cl::desc(
+ "Enable a specific type of debug output (comma separated list "
+ "of types using the format \"type[:level]\", where the level "
+ "is an optional integer. The level can be set to 1, 2, 3, etc. to "
+ "control the verbosity of the output. Setting a debug-type level "
+ "to zero acts as an opt-out for this specific debug-type without "
+ "affecting the others."),
cl::Hidden, cl::value_desc("debug string"),
cl::location(DebugOnlyOptLoc), cl::ValueRequired);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b44..59cc1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,7 +13,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e3ca09e..f25ce87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
- ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
- MVT::f32, Legal);
+ ISD::FROUNDEVEN, ISD::FTRUNC},
+ {MVT::f16, MVT::f32}, Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- if (Subtarget->has16BitInsts())
+ if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- else {
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+ } else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 9a2bab1..0a0a107 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
}
+std::pair<unsigned, unsigned>
+GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
+ const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
+
+ unsigned MaxNumVGPRs = MaxVectorRegs;
+ unsigned MaxNumAGPRs = 0;
+
+ // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
+ // a wave may have up to 512 total vector registers combining together both
+ // VGPRs and AGPRs. Hence, in an entry function without calls and without
+ // AGPRs used within it, it is possible to use the whole vector register
+ // budget for VGPRs.
+ //
+ // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
+ // register file accordingly.
+ if (hasGFX90AInsts()) {
+ unsigned MinNumAGPRs = 0;
+ const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
+ const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+ const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
+
+ // TODO: The lower bound should probably force the number of required
+ // registers up, overriding amdgpu-waves-per-eu.
+ std::tie(MinNumAGPRs, MaxNumAGPRs) =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
+ /*OnlyFirstRequired=*/true);
+
+ if (MinNumAGPRs == DefaultNumAGPR.first) {
+ // Default to splitting half the registers if AGPRs are required.
+ MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
+ } else {
+ // Align to accum_offset's allocation granularity.
+ MinNumAGPRs = alignTo(MinNumAGPRs, 4);
+
+ MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
+ }
+
+ // Clamp values to be inbounds of our limits, and ensure min <= max.
+
+ MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
+ MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
+
+ MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+ MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
+
+ assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
+ MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+ "invalid register counts");
+ } else if (hasMAIInsts()) {
+ // On gfx908 the number of AGPRs always equals the number of VGPRs.
+ MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
+ }
+
+ return std::pair(MaxNumVGPRs, MaxNumAGPRs);
+}
+
void GCNSubtarget::adjustSchedDependency(
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
const TargetSchedModel *SchedModel) const {
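
For intuition about the numbers involved, here is a self-contained sketch of the budget-splitting arithmetic that the new GCNSubtarget::getMaxNumVectorRegs() performs for gfx90a-style subtargets. The function name splitVectorRegBudget, the helper alignTo4, and the default class sizes of 256 are assumptions made for this sketch only; the clamping steps mirror the code above.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <utility>

// alignTo4 stands in for llvm::alignTo(V, 4) in this sketch.
static unsigned alignTo4(unsigned V) { return (V + 3) & ~3u; }

// Sketch of the VGPR/AGPR budget split for gfx90a-style subtargets.
// Min/MaxRequestedAGPRs model the "amdgpu-agpr-alloc" attribute, with ~0u
// meaning "attribute not set"; the class sizes are placeholders.
static std::pair<unsigned, unsigned>
splitVectorRegBudget(unsigned MaxVectorRegs, unsigned MinRequestedAGPRs,
                     unsigned MaxRequestedAGPRs, unsigned TotalVGPRs = 256,
                     unsigned TotalAGPRs = 256) {
  unsigned MinNumAGPRs = MinRequestedAGPRs;
  unsigned MaxNumAGPRs = MaxRequestedAGPRs;
  if (MinNumAGPRs == ~0u) {
    // Attribute not set: default to splitting half the budget.
    MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
  } else {
    // Align to accum_offset's allocation granularity, capped by the class size.
    MinNumAGPRs = std::min(alignTo4(MinNumAGPRs), TotalAGPRs);
  }
  // Clamp to the limits and ensure min <= max, exactly as in the code above.
  MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
  MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalAGPRs), MaxNumAGPRs);
  unsigned MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalVGPRs);
  MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
  assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && "invalid register counts");
  return {MaxNumVGPRs, MaxNumAGPRs};
}

int main() {
  // No "amdgpu-agpr-alloc": a 256-register budget splits evenly.
  auto [V0, A0] = splitVectorRegBudget(256, ~0u, ~0u);
  std::printf("default split: %u VGPRs, %u AGPRs\n", V0, A0); // 128 / 128
  // "amdgpu-agpr-alloc"=0: the whole budget goes to VGPRs.
  auto [V1, A1] = splitVectorRegBudget(256, 0, 0);
  std::printf("no AGPRs:      %u VGPRs, %u AGPRs\n", V1, A1); // 256 / 0
}
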
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 88a269f..785ede3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1667,6 +1667,10 @@ public:
return getMaxNumVGPRs(F);
}
+ /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+ /// of waves per execution unit required for the function \p F.
+ std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe..520c321 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -552,7 +552,7 @@ public:
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -565,7 +565,6 @@ public:
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8d6c1d0..2aa6b4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
+bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
+ if (!isFLAT(MI) || isFLATGlobal(MI))
+ return false;
+
+ // If scratch is not initialized, we can never access it.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
+ // SCRATCH instructions always access scratch.
+ if (isFLATScratch(MI))
+ return true;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+ // TODO (?): Does this need to be taught how to read noalias.addrspace ?
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
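
To make the decision order of the new hook easier to follow, here is a small standalone sketch of the conservative predicate. The FlatMemInst struct and its flags are stand-ins for the MachineInstr, TSFlags, and memoperand queries used by SIInstrInfo::mayAccessScratchThroughFlat(); they are not LLVM API.

#include <algorithm>
#include <cassert>
#include <vector>

// Simplified model for this sketch only.
enum class AddrSpace { Global, Private, Flat };

struct FlatMemInst {
  bool IsFlat = true;              // TSFlags say FLAT
  bool IsFlatGlobal = false;       // GLOBAL_* encoding
  bool IsFlatScratch = false;      // SCRATCH_* encoding
  bool NoFlatScratchInit = false;  // caller has "amdgpu-no-flat-scratch-init"
  std::vector<AddrSpace> MemOperandAS; // address spaces of attached memoperands
};

// Mirrors the decision order above: answer "true" whenever the instruction
// cannot be proven to avoid scratch.
static bool mayAccessScratch(const FlatMemInst &MI) {
  if (!MI.IsFlat || MI.IsFlatGlobal)
    return false;
  if (MI.NoFlatScratchInit) // scratch is never initialized, so never accessed
    return false;
  if (MI.IsFlatScratch)     // SCRATCH instructions always access scratch
    return true;
  if (MI.MemOperandAS.empty()) // no memoperands: be conservative
    return true;
  return std::any_of(MI.MemOperandAS.begin(), MI.MemOperandAS.end(),
                     [](AddrSpace AS) {
                       return AS == AddrSpace::Private || AS == AddrSpace::Flat;
                     });
}

int main() {
  FlatMemInst GlobalLoad;
  GlobalLoad.IsFlatGlobal = true;
  FlatMemInst NoScratchInit;
  NoScratchInit.NoFlatScratchInit = true;
  FlatMemInst FlatNoInfo; // plain FLAT_* access with no memoperands attached
  FlatMemInst FlatGlobalOnly;
  FlatGlobalOnly.MemOperandAS = {AddrSpace::Global};
  assert(!mayAccessScratch(GlobalLoad));     // GLOBAL_* never touches scratch
  assert(!mayAccessScratch(NoScratchInit));  // scratch never initialized
  assert(mayAccessScratch(FlatNoInfo));      // conservative answer
  assert(!mayAccessScratch(FlatGlobalOnly)); // memoperand proves global-only
}
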
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2ffb783..e042b59 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -678,6 +678,12 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
+ /// SCRATCH_ memory operands.
+ /// Conservatively correct; will return true if \p MI cannot be proven
+ /// to not hit scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9f61bf8..9509199 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
BitVector ReservedRegs = TRI->getReservedRegs(MF);
BitVector NonWwmAllocMask(TRI->getNumRegs());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
// to have a balanced allocation between WWM values and per-thread vector
@@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
NumRegs =
std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs);
- auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
// Try to use the highest available registers for now. Later after
// vgpr-regalloc, they can be shifted to the lowest range.
unsigned I = 0;
@@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
// Reserve an arbitrary register and report the error.
TRI->markSuperRegs(RegMask, AMDGPU::VGPR0);
MF.getFunction().getContext().emitError(
- "can't find enough VGPRs for wwm-regalloc");
+ "cannot find enough VGPRs for wwm-regalloc");
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420..f1262e11 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2536,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2551,11 +2548,24 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::expandSystemScopeStore(
- MachineBasicBlock::iterator &MI) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
- if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
- return insertWaitsBeforeSystemScopeStore(MI);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+
+ // GFX12.0 only: Extra waits needed before system scope stores.
+ if (!ST.hasGFX1250Insts()) {
+ if (!Atomic && Scope == CPol::SCOPE_SYS)
+ return insertWaitsBeforeSystemScopeStore(MI);
+ return false;
+ }
+
+ // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
+ // space.
+ if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
+ return setScope(MI, CPol::SCOPE_SE);
return false;
}
@@ -2658,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2674,6 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2686,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->expandSystemScopeStore(MI);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
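
For readers following the cache-control changes: below is a compact standalone sketch of the two fixups the new finalizeStore() hook performs. The CPol constants, the flags, and the StoreFixup struct are simplified stand-ins invented for this sketch; they are not the real AMDGPU::CPol encodings, and the real hook mutates the MachineInstr in place rather than returning a result.

#include <cassert>

// Placeholder scope encoding for this sketch only.
namespace CPol {
enum : unsigned { SCOPE = 0x3, SCOPE_CU = 0x0, SCOPE_SE = 0x1, SCOPE_SYS = 0x3 };
} // namespace CPol

struct StoreFixup {
  bool InsertWaits = false; // waits inserted before a system-scope store
  unsigned NewScope = 0;    // possibly promoted scope field
};

// Sketch of the two target-specific fixups applied above: on GFX12.0,
// non-atomic system-scope stores get extra waits; on GFX12.5 (gfx1250),
// stores that may hit scratch are promoted from CU to SE scope.
static StoreFixup finalizeStore(unsigned CPolImm, bool Atomic, bool IsGfx1250,
                                bool MayAccessScratch) {
  StoreFixup R{false, CPolImm & CPol::SCOPE};
  if (!IsGfx1250) {
    if (!Atomic && R.NewScope == CPol::SCOPE_SYS)
      R.InsertWaits = true;
    return R;
  }
  if (MayAccessScratch && R.NewScope == CPol::SCOPE_CU)
    R.NewScope = CPol::SCOPE_SE;
  return R;
}

int main() {
  // GFX12.0 non-atomic system-scope store: waits are required.
  assert(finalizeStore(CPol::SCOPE_SYS, false, false, false).InsertWaits);
  // GFX12.5 scratch-capable store at CU scope: promoted to SE scope.
  assert(finalizeStore(CPol::SCOPE_CU, false, true, true).NewScope ==
         CPol::SCOPE_SE);
}
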
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 84cfa87..f3acc5c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
-std::pair<unsigned, unsigned>
-SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
- const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
-
- unsigned MaxNumVGPRs = MaxVectorRegs;
- unsigned MaxNumAGPRs = 0;
-
- // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
- // a wave may have up to 512 total vector registers combining together both
- // VGPRs and AGPRs. Hence, in an entry function without calls and without
- // AGPRs used within it, it is possible to use the whole vector register
- // budget for VGPRs.
- //
- // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
- // register file accordingly.
- if (ST.hasGFX90AInsts()) {
- unsigned MinNumAGPRs = 0;
- const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
- const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
-
- const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
-
- // TODO: Move this logic into subtarget on IR function
- //
- // TODO: The lower bound should probably force the number of required
- // registers up, overriding amdgpu-waves-per-eu.
- std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
- MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
- /*OnlyFirstRequired=*/true);
-
- if (MinNumAGPRs == DefaultNumAGPR.first) {
- // Default to splitting half the registers if AGPRs are required.
- MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
- } else {
- // Align to accum_offset's allocation granularity.
- MinNumAGPRs = alignTo(MinNumAGPRs, 4);
-
- MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
- }
-
- // Clamp values to be inbounds of our limits, and ensure min <= max.
-
- MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
- MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
-
- MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
- MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
-
- assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
- MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
- "invalid register counts");
- } else if (ST.hasMAIInsts()) {
- // On gfx908 the number of AGPRs always equals the number of VGPRs.
- MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
- }
-
- return std::pair(MaxNumVGPRs, MaxNumAGPRs);
-}
-
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve VGPRs/AGPRs.
//
- auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
for (const TargetRegisterClass *RC : regclasses()) {
if (RC->isBaseClass() && isVGPRClass(RC)) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0008e5f..5508f07 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -90,11 +90,6 @@ public:
/// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
- /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
- /// of waves per execution unit required for the function \p MF.
- std::pair<unsigned, unsigned>
- getMaxNumVectorRegs(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 50217c3..9e4dbec 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl(
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI.getParent()->getParent();
- // FIXME: Use Function::hasOptSize().
- if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize))
+ if (MF->getFunction().hasOptSize())
--Latency;
}
return Latency;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fca5dff..066b392 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
@@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
+ setOperationAction(ISD::FRINT, MVT::f16, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f16, Legal);
}
if (Subtarget->hasNEON()) {
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index c86fa2b..54c3cea 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
const Function &F = MF.getFunction();
- bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = F.hasOptSize();
// Combine aggressively (for code size)
ShouldCombineAggressively =
diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp
index 6eccf80..9d7776d 100644
--- a/llvm/lib/Target/Hexagon/HexagonMask.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp
@@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) {
HII = HST.getInstrInfo();
const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.hasOptSize())
return false;
// Mask instruction is available only from v66
if (!HST.hasV66Ops())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 613cfb5..d96136c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2385,13 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
return Res;
}
-static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) {
- for (unsigned i = 0; i < Op->getNumOperands(); ++i)
- if (isIntOrFPConstant(Op->getOperand(i)))
- return true;
- return false;
-}
-
// Lower BUILD_VECTOR as broadcast load (if possible).
// For example:
// %a = load i8, ptr %ptr
@@ -2441,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
EVT ResTy = Op->getValueType(0);
+ unsigned NumElts = ResTy.getVectorNumElements();
SDLoc DL(Op);
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
+ bool IsConstant = false;
+ bool UseSameConstant = true;
+ SDValue ConstantValue;
bool Is128Vec = ResTy.is128BitVector();
bool Is256Vec = ResTy.is256BitVector();
@@ -2495,13 +2492,35 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
if (DAG.isSplatValue(Op, /*AllowUndefs=*/false))
return Op;
- if (!isConstantBUILD_VECTOR(Node)) {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (isIntOrFPConstant(Opi)) {
+ IsConstant = true;
+ if (!ConstantValue.getNode())
+ ConstantValue = Opi;
+ else if (ConstantValue != Opi)
+ UseSameConstant = false;
+ }
+ }
+
+ // If the type of BUILD_VECTOR is v2f64, custom legalizing it has no benefits.
+ if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) {
+ SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (!isIntOrFPConstant(Opi))
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi,
+ DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+ }
+ return Result;
+ }
+
+ if (!IsConstant) {
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
// The resulting code is the same length as the expansion, but it doesn't
// use memory operations.
assert(ResTy.isVector());
- unsigned NumElts = ResTy.getVectorNumElements();
SDValue Op0 = Node->getOperand(0);
SDValue Vector = DAG.getUNDEF(ResTy);
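
The new operand scan in lowerBUILD_VECTOR() decides between splatting a repeated constant and other lowerings. Here is a standalone sketch of that classification, with operands modelled as std::optional<int> (nullopt meaning a run-time value) and the strategy names invented for this sketch; the real code additionally skips the splat-then-insert path for v2f64 and otherwise falls back to the default expansion.

#include <cassert>
#include <optional>
#include <vector>

// Names for this sketch only.
enum class Strategy { SplatThenInsert, InsertEachElement, DefaultExpansion };

static Strategy pickStrategy(const std::vector<std::optional<int>> &Ops) {
  bool IsConstant = false;
  bool UseSameConstant = true;
  std::optional<int> ConstantValue;
  for (const std::optional<int> &Op : Ops) {
    if (!Op)
      continue;
    IsConstant = true;
    if (!ConstantValue)
      ConstantValue = Op;
    else if (*ConstantValue != *Op)
      UseSameConstant = false;
  }
  // All constant elements share one value: splat that constant and only
  // INSERT_VECTOR_ELT the run-time elements.
  if (IsConstant && UseSameConstant)
    return Strategy::SplatThenInsert;
  // No constants at all: insert every element rather than expand to stores.
  if (!IsConstant)
    return Strategy::InsertEachElement;
  return Strategy::DefaultExpansion;
}

int main() {
  // {x, 7, 7, y}: one splat of 7 plus two inserts.
  assert(pickStrategy({std::nullopt, 7, 7, std::nullopt}) ==
         Strategy::SplatThenInsert);
  // {x, y}: nothing constant, insert each element.
  assert(pickStrategy({std::nullopt, std::nullopt}) ==
         Strategy::InsertEachElement);
  // {1, 2}: constants differ, leave it to the default expansion.
  assert(pickStrategy({1, 2}) == Strategy::DefaultExpansion);
}
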
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index ca03310..a2e48ab 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
if (FS.empty() && M.size() && F->hasFnAttribute("target-features"))
FS = F->getFnAttribute("target-features").getValueAsString();
+ std::string strFS = FS.str();
+ if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool())
+ strFS += strFS.empty() ? "+soft-float" : ",+soft-float";
+
// Compute MIPS architecture attributes based on the default subtarget
// that we'd have constructed.
// FIXME: For ifunc related functions we could iterate over and look
// for a feature string that doesn't match the default one.
StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
- const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM,
- std::nullopt);
+ const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(),
+ MTM, std::nullopt);
bool IsABICalls = STI.isABICalls();
const MipsABIInfo &ABI = MTM.getABI();
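
The target-features adjustment above is plain string handling; a tiny self-contained sketch of it follows (appendSoftFloat is an invented name that mirrors the ternary in the patch).

#include <cassert>
#include <string>

// Append "+soft-float" to a (possibly empty) comma-separated feature string,
// as done before constructing the temporary MipsSubtarget above.
static std::string appendSoftFloat(std::string FS, bool UseSoftFloat) {
  if (UseSoftFloat)
    FS += FS.empty() ? "+soft-float" : ",+soft-float";
  return FS;
}

int main() {
  assert(appendSoftFloat("", true) == "+soft-float");
  assert(appendSoftFloat("+mips32r2,+fp64", true) ==
         "+mips32r2,+fp64,+soft-float");
  assert(appendSoftFloat("+mips32r2", false) == "+mips32r2");
}
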
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0e581a7..ec6b382 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
- setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
-
setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
ISD::SIGN_EXTEND});
@@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG);
case ISD::READCYCLECOUNTER:
return lowerREADCYCLECOUNTER(Op, DAG);
- case ISD::ConstantFP:
- return lowerConstantFP(Op, DAG);
}
return SDValue();
}
@@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
}
-SDValue MipsTargetLowering::lowerConstantFP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getSimpleValueType();
- SDNode *N = Op.getNode();
- ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N);
-
- if (!CFP->isNaN() || Subtarget.isNaN2008()) {
- return SDValue();
- }
-
- APFloat NaNValue = CFP->getValueAPF();
- auto &Sem = NaNValue.getSemantics();
-
- // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN
- // encodings, and one for sNaNs. Check every NaN constants and make sure
- // they are correctly encoded for legacy encodings.
- if (!NaNValue.isSignaling()) {
- APFloat RealQNaN = NaNValue.getSNaN(Sem);
- return DAG.getConstantFP(RealQNaN, DL, VT);
- }
- return SDValue();
-}
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 31ac5d4..c65c76c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -592,7 +592,6 @@ class TargetRegisterClass;
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 54845e5..607edd3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
}
}
+bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
+ EVT ScalarTy) const {
+ if (!ScalarTy.isSimple())
+ return false;
+ switch (ScalarTy.getSimpleVT().SimpleTy) {
+ case MVT::iPTR:
+ return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f16:
+ case MVT::bf16:
+ case MVT::f32:
+ return true;
+ case MVT::i64:
+ case MVT::f64:
+ return Subtarget.hasVInstructionsI64();
+ default:
+ return false;
+ }
+}
unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
return NumRepeatedDivisors;
@@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- if (!isLegalElementTypeForRVV(ScalarType))
+ if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ca70c46..a788c0b7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -384,6 +384,7 @@ public:
bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
bool isLegalElementTypeForRVV(EVT ScalarTy) const;
+ bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 30d8f85..3cbe668 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
+ if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99c..f0510ec 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -265,7 +265,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 81de0a5..0164fcd 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -97,8 +97,6 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
-STATISTIC(MismatchedCloneAssignments,
- "Number of callsites assigned to call multiple non-matching clones");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -2062,20 +2060,6 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
-// Return the clone number of the given function by extracting it from the
-// memprof suffix. Assumes the caller has already confirmed it is a memprof
-// clone.
-static unsigned getMemProfCloneNum(const Function &F) {
- assert(isMemProfClone(F));
- auto Pos = F.getName().find_last_of('.');
- assert(Pos > 0);
- unsigned CloneNo;
- bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
- assert(!Err);
- (void)Err;
- return CloneNo;
-}
-
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -3995,22 +3979,7 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
- auto NewCalleeCloneNo = CalleeFunc.cloneNo();
- if (isMemProfClone(*CurF)) {
- // If we already assigned this callsite to call a specific non-default
- // clone (i.e. not the original function which is clone 0), ensure that we
- // aren't trying to now update it to call a different clone, which is
- // indicative of a bug in the graph or function assignment.
- auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
- if (CurCalleeCloneNo != NewCalleeCloneNo) {
- LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
- << CurCalleeCloneNo << " now " << NewCalleeCloneNo
- << "\n");
- MismatchedCloneAssignments++;
- }
- }
- if (NewCalleeCloneNo > 0)
+ if (CalleeFunc.cloneNo() > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
@@ -4026,19 +3995,7 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
- auto NewCalleeCloneNo = CalleeFunc.cloneNo();
- auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
- // If we already assigned this callsite to call a specific non-default
- // clone (i.e. not the original function which is clone 0), ensure that we
- // aren't trying to now update it to call a different clone, which is
- // indicative of a bug in the graph or function assignment.
- if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
- LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
- << CurCalleeCloneNo << " now " << NewCalleeCloneNo
- << "\n");
- MismatchedCloneAssignments++;
- }
- CurCalleeCloneNo = NewCalleeCloneNo;
+ CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
}
// Update the debug information attached to NewFunc to use the clone Name. Note
@@ -4746,19 +4703,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}
- auto FindFirstAvailFuncClone = [&]() {
- // Find first function in FuncClonesToCallMap without an assigned
- // clone of this callsite Node. We should always have one
- // available at this point due to the earlier cloning when the
- // FuncClonesToCallMap size was smaller than the clone number.
- for (auto &CF : FuncClonesToCallMap) {
- if (!FuncCloneToCurNodeCloneMap.count(CF.first))
- return CF.first;
- }
- assert(false &&
- "Expected an available func clone for this callsite clone");
- };
-
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
@@ -4875,7 +4819,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
- FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
+ // Find first function in FuncClonesToCallMap without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncClonesToCallMap size was smaller than the clone number.
+ for (auto &CF : FuncClonesToCallMap) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
+ FuncCloneAssignedToCurCallsiteClone = CF.first;
+ break;
+ }
+ }
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
@@ -4889,31 +4842,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
- // If we didn't assign a function clone to this callsite clone yet, e.g.
- // none of its callers has a non-null call, do the assignment here.
- // We want to ensure that every callsite clone is assigned to some
- // function clone, so that the call updates below work as expected.
- // In particular if this is the original callsite, we want to ensure it
- // is assigned to the original function, otherwise the original function
- // will appear available for assignment to other callsite clones,
- // leading to unintended effects. For one, the unknown and not updated
- // callers will call into cloned paths leading to the wrong hints,
- // because they still call the original function (clone 0). Also,
- // because all callsites start out as being clone 0 by default, we can't
- // easily distinguish between callsites explicitly assigned to clone 0
- // vs those never assigned, which can lead to multiple updates of the
- // calls when invoking updateCall below, with mismatched clone values.
- // TODO: Add a flag to the callsite nodes or some other mechanism to
- // better distinguish and identify callsite clones that are not getting
- // assigned to function clones as expected.
- if (!FuncCloneAssignedToCurCallsiteClone) {
- FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
- assert(FuncCloneAssignedToCurCallsiteClone &&
- "No available func clone for this callsite clone");
- AssignCallsiteCloneToFuncClone(
- FuncCloneAssignedToCurCallsiteClone, Call, Clone,
- /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
- }
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 033ef8b..a43a6ee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) &&
all_equal(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ ElementCount::isKnownGE(Shuf->getType()->getElementCount(),
+ cast<VectorType>(Shuf->getOperand(0)->getType())
+ ->getElementCount())) {
// trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask
// trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType(
+ Trunc.getType()->getScalarType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy);
return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask());
}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index f6780c0..ce1d9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
if (DisableMemOPOPT)
return false;
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
MemOPSizeOpt.perform();
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d..b5eb647 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1176,6 +1176,28 @@ private:
return true;
}
+ /// This function fixes PHI nodes after fusion in \p SafeToSink.
+ /// \p SafeToSink instructions are the instructions that are to be moved past
+ /// the fused loop. Thus, the PHI nodes in \p SafeToSink should be updated to
+ /// receive values from the fused loop if they are currently taking values
+ /// from the first loop (i.e. FC0)'s latch.
+ void fixPHINodes(ArrayRef<Instruction *> SafeToSink,
+ const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ for (Instruction *Inst : SafeToSink) {
+ // No update needed for non-PHI nodes.
+ PHINode *Phi = dyn_cast<PHINode>(Inst);
+ if (!Phi)
+ continue;
+ for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+ if (Phi->getIncomingBlock(I) != FC0.Latch)
+ continue;
+ assert(FC1.Latch && "FC1 latch is not set");
+ Phi->setIncomingBlock(I, FC1.Latch);
+ }
+ }
+ }
+
/// Collect instructions in the \p FC1 Preheader that can be hoisted
/// to the \p FC0 Preheader or sunk into the \p FC1 Body
bool collectMovablePreheaderInsts(
@@ -1481,6 +1503,9 @@ private:
assert(I->getParent() == FC1.Preheader);
I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt());
}
+ // PHI nodes in SinkInsts need to be updated to receive values from the
+ // fused loop.
+ fixPHINodes(SinkInsts, FC0, FC1);
}
/// Determine if two fusion candidates have identical guards
diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9fe655e..fca09c6 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) {
static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
LibCallsShrinkWrap CCDCE(TLI, DTU);
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index ddb062b..571fa11 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1257,7 +1257,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
assert(Diff->getType()->isIntegerTy() &&
"difference must be of integer type");
Value *DiffV = expand(Diff);
- Value *BaseV = &PN;
+ Value *BaseV = fixupLCSSAFormFor(&PN);
if (PhiTy->isPointerTy()) {
if (STy->isPointerTy())
return Builder.CreatePtrAdd(BaseV, DiffV);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6616e61f..7b7efb8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,11 +1363,15 @@ public:
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
if (EVLIsLegal)
return;
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
- ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask};
+ // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
+ // if it's allowed, or DataWithoutLaneMask otherwise.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ else
+ ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask};
+
LLVM_DEBUG(
dbgs() << "LV: Preference for VP intrinsics indicated. Will "
"not try to generate VP Intrinsics "
@@ -4500,19 +4504,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
- if (MainLoopVF.isFixed()) {
- // TODO: extend to support scalable VFs.
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(MainLoopVF).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
-
- // No iterations left to process in the epilogue.
- if (RemainingIterations->isZero())
- return Result;
+ const SCEV *TC =
+ vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+ RemainingIterations =
+ SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+ if (MainLoopVF.isFixed()) {
MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {