Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/InlineCost.cpp | 2
-rw-r--r--  llvm/lib/Analysis/LoopAccessAnalysis.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/TailDuplicator.cpp | 6
-rw-r--r--  llvm/lib/IR/AsmWriter.cpp | 4
-rw-r--r--  llvm/lib/LTO/LTO.cpp | 6
-rw-r--r--  llvm/lib/ObjCopy/ELF/ELFObject.h | 12
-rw-r--r--  llvm/lib/ObjectYAML/GOFFEmitter.cpp | 2
-rw-r--r--  llvm/lib/Support/APFloat.cpp | 5
-rw-r--r--  llvm/lib/Support/SpecialCaseList.cpp | 32
-rw-r--r--  llvm/lib/Target/AArch64/AArch64BranchTargets.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 89
-rw-r--r--  llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 13
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMProcessors.td | 11
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 43
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 155
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp | 86
-rw-r--r--  llvm/lib/TargetParser/Host.cpp | 1
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 16
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 20
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 27
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 10
31 files changed, 482 insertions(+), 125 deletions(-)
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index c4fee39..5169b43 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1242,7 +1242,7 @@ public:
return std::nullopt;
}
- virtual ~InlineCostCallAnalyzer() = default;
+ ~InlineCostCallAnalyzer() override = default;
int getThreshold() const { return Threshold; }
int getCost() const { return Cost; }
int getStaticBonusApplied() const { return StaticBonusApplied; }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7adb25d..e27a9b1 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2982,6 +2982,10 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
if (!StrideExpr)
return;
+ if (auto *Unknown = dyn_cast<SCEVUnknown>(StrideExpr))
+ if (isa<UndefValue>(Unknown->getValue()))
+ return;
+
LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
"versioning:");
LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
index 12a784e..11ca48d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -18,8 +18,7 @@ using namespace llvm;
unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) {
resetUsedFlag(true);
- auto IterBool =
- Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS)));
+ auto IterBool = Pool.try_emplace(Sym, Pool.size(), TLS);
return IterBool.first->second.Number;
}
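
The try_emplace changes in this commit (here and in TailDuplicator.cpp below) all follow the same idiom. A minimal standalone sketch, using std::map as a stand-in for llvm::DenseMap (both provide try_emplace with the same shape): try_emplace forwards the mapped-value constructor arguments and constructs the value only when the key is absent, avoiding the temporary pair that insert(std::make_pair(...)) materializes.

#include <cstdio>
#include <map>

struct AddressPoolEntry {
  unsigned Number;
  bool TLS;
  AddressPoolEntry(unsigned N, bool T) : Number(N), TLS(T) {}
};

int main() {
  std::map<int, AddressPoolEntry> Pool; // int key simplifies MCSymbol*
  // Key 7 is absent: the entry is constructed in place from (0, true).
  auto IterBool = Pool.try_emplace(7, Pool.size(), true);
  // Key 7 already exists: no AddressPoolEntry is constructed at all.
  auto Again = Pool.try_emplace(7, Pool.size(), false);
  std::printf("first=%d second=%d number=%u\n", IterBool.second,
              Again.second, IterBool.first->second.Number);
  return 0;
}
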
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 93ae548..7bef3a8 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -86,10 +86,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
template <> struct std::hash<VarLocInsertPt> {
- using argument_type = VarLocInsertPt;
- using result_type = std::size_t;
-
- result_type operator()(const argument_type &Arg) const {
+ std::size_t operator()(const VarLocInsertPt &Arg) const {
return std::hash<void *>()(Arg.getOpaqueValue());
}
};
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 8e48d19..109444b 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -363,7 +363,7 @@ void TailDuplicator::processPHI(
Register SrcReg = MI->getOperand(SrcOpIdx).getReg();
unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg();
const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
- LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg)));
+ LocalVRMap.try_emplace(DefReg, SrcReg, SrcSubReg);
// Insert a copy from source to the end of the block. The def register is the
// available value liveout of the block.
@@ -411,7 +411,7 @@ void TailDuplicator::duplicateInstruction(
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
Register NewReg = MRI->createVirtualRegister(RC);
MO.setReg(NewReg);
- LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ LocalVRMap.try_emplace(Reg, NewReg, 0);
if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
addSSAUpdateEntry(Reg, NewReg, PredBB);
continue;
@@ -463,7 +463,7 @@ void TailDuplicator::duplicateInstruction(
NewReg)
.addReg(VI->second.Reg, 0, VI->second.SubReg);
LocalVRMap.erase(VI);
- LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ LocalVRMap.try_emplace(Reg, NewReg, 0);
MO.setReg(NewReg);
// The composed VI.Reg:VI.SubReg is replaced with NewReg, which
// is equivalent to the whole register Reg. Hence, Reg:subreg
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 1096e57..3c222f5 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -843,7 +843,7 @@ public:
SlotTracker(const SlotTracker &) = delete;
SlotTracker &operator=(const SlotTracker &) = delete;
- ~SlotTracker() = default;
+ ~SlotTracker() override = default;
void setProcessHook(
std::function<void(AbstractSlotTrackerStorage *, const Module *, bool)>);
@@ -5323,7 +5323,7 @@ struct MDTreeAsmWriterContext : public AsmWriterContext {
--Level;
}
- ~MDTreeAsmWriterContext() {
+ ~MDTreeAsmWriterContext() override {
for (const auto &Entry : Buffer) {
MainOS << "\n";
unsigned NumIndent = Entry.first * 2U;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 4bc2a18..b618222 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1711,7 +1711,7 @@ public:
/*ShouldEmitImportsFiles=*/false),
IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {}
- virtual Error runThinLTOBackendThread(
+ Error runThinLTOBackendThread(
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
@@ -2271,8 +2271,8 @@ public:
RemoteCompilerPrependArgs(RemoteCompilerPrependArgs),
RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps) {}
- virtual void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset,
- llvm::Triple Triple) override {
+ void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset,
+ llvm::Triple Triple) override {
UID = itostr(sys::Process::getProcessId());
Jobs.resize((size_t)ThinLTONumTasks);
this->ThinLTOTaskOffset = ThinLTOTaskOffset;
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index 7ec0e9b..4f6473f 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -109,7 +109,7 @@ protected:
WritableMemoryBuffer &Out;
public:
- virtual ~SectionWriter() = default;
+ ~SectionWriter() override = default;
Error visit(const Section &Sec) override;
Error visit(const OwnedDataSection &Sec) override;
@@ -134,7 +134,7 @@ private:
using Elf_Sym = typename ELFT::Sym;
public:
- virtual ~ELFSectionWriter() {}
+ ~ELFSectionWriter() override {}
Error visit(const SymbolTableSection &Sec) override;
Error visit(const RelocationSection &Sec) override;
Error visit(const GnuDebugLinkSection &Sec) override;
@@ -180,7 +180,7 @@ public:
class BinarySectionWriter : public SectionWriter {
public:
- virtual ~BinarySectionWriter() {}
+ ~BinarySectionWriter() override {}
Error visit(const SymbolTableSection &Sec) override;
Error visit(const RelocationSection &Sec) override;
@@ -346,7 +346,7 @@ private:
size_t totalSize() const;
public:
- virtual ~ELFWriter() {}
+ ~ELFWriter() override {}
bool WriteSectionHeaders;
// For --only-keep-debug, select an alternative section/segment layout
@@ -367,7 +367,7 @@ private:
uint64_t TotalSize = 0;
public:
- ~BinaryWriter() {}
+ ~BinaryWriter() override {}
Error finalize() override;
Error write() override;
BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config)
@@ -784,7 +784,7 @@ private:
SymbolTableSection *Symbols = nullptr;
public:
- virtual ~SectionIndexSection() {}
+ ~SectionIndexSection() override {}
void addIndex(uint32_t Index) {
assert(Size > 0);
Indexes.push_back(Index);
diff --git a/llvm/lib/ObjectYAML/GOFFEmitter.cpp b/llvm/lib/ObjectYAML/GOFFEmitter.cpp
index c26893c..82800b1 100644
--- a/llvm/lib/ObjectYAML/GOFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/GOFFEmitter.cpp
@@ -71,7 +71,7 @@ public:
SetBufferSize(GOFF::PayloadLength);
}
- ~GOFFOstream() { finalize(); }
+ ~GOFFOstream() override { finalize(); }
void makeNewRecord(GOFF::RecordType Type, size_t Size) {
fillRecord();
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 4787604..e21cf8e 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -5354,7 +5354,7 @@ APInt DoubleAPFloat::bitcastToAPInt() const {
Floats[0].bitcastToAPInt().getRawData()[0],
Floats[1].bitcastToAPInt().getRawData()[0],
};
- return APInt(128, 2, Data);
+ return APInt(128, Data);
}
Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S,
@@ -5643,8 +5643,7 @@ APFloat::opStatus DoubleAPFloat::convertFromUnsignedParts(
// Create a minimally-sized APInt to represent the source value.
const unsigned SrcBitWidth = SrcMSB + 1;
- APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth,
- /*numWords=*/SrcCount, Src},
+ APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth, ArrayRef(Src, SrcCount)},
/*isUnsigned=*/true};
// Stage 1: Initial Approximation.
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 3a97185..246d90c 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -94,6 +94,19 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
StringRef Prefix = G.Pattern.prefix();
StringRef Suffix = G.Pattern.suffix();
+ if (Suffix.empty() && Prefix.empty()) {
+ // If both the prefix and the suffix are empty, put the glob into a
+ // special tree to search by a substring in the middle.
+ StringRef Substr = G.Pattern.longest_substr();
+ if (!Substr.empty()) {
+ // But only if the substring is non-empty: searching this tree is more
+ // expensive.
+ auto &V = SubstrToGlob.emplace(Substr).first->second;
+ V.emplace_back(&G);
+ continue;
+ }
+ }
+
auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second;
auto &V = SToGlob.emplace(reverse(Suffix)).first->second;
V.emplace_back(&G);
@@ -119,6 +132,25 @@ void SpecialCaseList::GlobMatcher::match(
}
}
}
+
+ if (!SubstrToGlob.empty()) {
+ // As we don't know where exactly the substring starts, we try all
+ // possibilities. In most cases the search fails on the first characters.
+ for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) {
+ for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) {
+ for (const auto *G : V) {
+ if (G->Pattern.match(Query)) {
+ Cb(G->Name, G->LineNo);
+ // As soon as we find a match in the vector, we can break for this
+ // vector, since the globs are already sorted by priority within the
+ // prefix group. However, we continue searching other prefix groups
+ // in the map, as they may contain a better match overall.
+ break;
+ }
+ }
+ }
+ }
+ }
}
SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
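
The middle-substring path added above can be sketched in isolation: globs whose pattern has neither a literal prefix nor a literal suffix (e.g. "*core*") are keyed by their longest literal substring, and matching tries every start offset of the query, failing fast on the first characters in most cases. A minimal sketch in standard C++, with hypothetical data and a flat list standing in for the radix tree and its find_prefixes():

#include <cstdio>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

int main() {
  // Hypothetical entries: each glob keyed by its longest literal substring.
  std::vector<std::pair<std::string, std::string>> SubstrToGlob = {
      {"core", "*core*"}, {"vmlinux", "*vmlinux*"}};
  std::string_view Query = "libcore.so";
  // Try every possible start of the substring within the query.
  for (std::string_view Q = Query; !Q.empty(); Q.remove_prefix(1))
    for (const auto &[Substr, Glob] : SubstrToGlob)
      if (Q.size() >= Substr.size() && Q.substr(0, Substr.size()) == Substr)
        std::printf("candidate %s at offset %zu\n", Glob.c_str(),
                    Query.size() - Q.size());
  return 0;
}
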
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 137ff89..f13554f 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -47,6 +47,8 @@ public:
StringRef getPassName() const override { return AARCH64_BRANCH_TARGETS_NAME; }
private:
+ const AArch64Subtarget *Subtarget;
+
void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
bool NeedsWinCFI);
};
@@ -75,6 +77,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
<< "********** Function: " << MF.getName() << '\n');
const Function &F = MF.getFunction();
+ Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+
// LLVM does not consider basic blocks which are the targets of jump tables
// to be address-taken (the address can't escape anywhere else), but they are
// used for indirect branches, so need BTI instructions.
@@ -100,9 +104,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
// a BTI, and pointing the indirect branch at that. For non-ELF targets we
// can't rely on that, so we assume that `CouldCall` is _always_ true due
// to the risk of long-branch thunks at link time.
- if (&MBB == &*MF.begin() &&
- (!MF.getSubtarget<AArch64Subtarget>().isTargetELF() ||
- (F.hasAddressTaken() || !F.hasLocalLinkage())))
+ if (&MBB == &*MF.begin() && (!Subtarget->isTargetELF() ||
+ (F.hasAddressTaken() || !F.hasLocalLinkage())))
CouldCall = true;
// If the block itself is address-taken, it could be indirectly branched
@@ -132,9 +135,6 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
<< (CouldCall ? "c" : "") << " to " << MBB.getName()
<< "\n");
- const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
- MBB.getParent()->getSubtarget().getInstrInfo());
-
unsigned HintNum = 32;
if (CouldCall)
HintNum |= 2;
@@ -162,6 +162,8 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
MBBI->getOpcode() == AArch64::PACIBSP))
return;
+ const AArch64InstrInfo *TII = Subtarget->getInstrInfo();
+
// Insert BTI exactly at the first executable instruction.
const DebugLoc DL = MBB.findDebugLoc(MBBI);
MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
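
For reference, the HintNum arithmetic in the hunk above encodes the BTI operand within the AArch64 HINT space: HINT #32 is plain BTI, bit 1 selects the call ("c") form, and bit 2 the jump ("j") form, so BTI c = #34, BTI j = #36, BTI jc = #38. This is a property of the ISA encoding, not of this patch; a small standalone sketch:

#include <cstdio>

int main() {
  for (bool CouldCall : {false, true})
    for (bool CouldJump : {false, true}) {
      unsigned HintNum = 32; // plain BTI
      if (CouldCall)
        HintNum |= 2;        // BTI c
      if (CouldJump)
        HintNum |= 4;        // BTI j
      std::printf("CouldCall=%d CouldJump=%d -> HINT #%u (BTI %s%s)\n",
                  CouldCall, CouldJump, HintNum, CouldCall ? "c" : "",
                  CouldJump ? "j" : "");
    }
  return 0;
}
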
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1e607f4..f63981b 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1871,7 +1871,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
}
bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
bool Modified = false;
for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c76689f..0f7b34c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -644,10 +644,10 @@ bool AArch64FrameLowering::hasReservedCallFrame(
MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
- const AArch64TargetLowering *TLI =
- MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
+ const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
[[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
@@ -1319,8 +1319,8 @@ StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
// TODO: This function currently does not work for scalable vectors.
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
int FI) const {
- const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
+ const AArch64RegisterInfo *RegInfo =
+ MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
? getFPOffset(MF, ObjectOffset).getFixed()
@@ -1343,10 +1343,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
TargetStackID::Value StackID, Register &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
- const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
@@ -1466,7 +1465,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
return FPOffset;
}
FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
- : (unsigned)AArch64::SP;
+ : MCRegister(AArch64::SP);
return SPOffset;
}
@@ -1589,8 +1588,8 @@ static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
namespace {
struct RegPairInfo {
- unsigned Reg1 = AArch64::NoRegister;
- unsigned Reg2 = AArch64::NoRegister;
+ Register Reg1;
+ Register Reg2;
int FrameIdx;
int Offset;
enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
@@ -1598,21 +1597,21 @@ struct RegPairInfo {
RegPairInfo() = default;
- bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+ bool isPaired() const { return Reg2.isValid(); }
bool isScalable() const { return Type == PPR || Type == ZPR; }
};
} // end anonymous namespace
-unsigned findFreePredicateReg(BitVector &SavedRegs) {
+MCRegister findFreePredicateReg(BitVector &SavedRegs) {
for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
if (SavedRegs.test(PReg)) {
unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
- return PNReg;
+ return MCRegister(PNReg);
}
}
- return AArch64::NoRegister;
+ return MCRegister();
}
// The multivector LD/ST are available only for SME or SVE2p1 targets
@@ -1930,8 +1929,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
bool PTrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
- unsigned Reg1 = RPI.Reg1;
- unsigned Reg2 = RPI.Reg2;
+ Register Reg1 = RPI.Reg1;
+ Register Reg2 = RPI.Reg2;
unsigned StrOpc;
// Issue sequence of spills for cs regs. The first spill may be converted
@@ -1967,7 +1966,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
break;
}
- unsigned X0Scratch = AArch64::NoRegister;
+ Register X0Scratch;
auto RestoreX0 = make_scope_exit([&] {
if (X0Scratch != AArch64::NoRegister)
BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0)
@@ -2009,11 +2008,15 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
}
- LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
- if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
- dbgs() << ")\n");
+ LLVM_DEBUG({
+ dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired())
+ dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n";
+ });
assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
"Windows unwdinding requires a consecutive (FP,LR) pair");
@@ -2143,8 +2146,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
bool PTrueCreated = false;
for (const RegPairInfo &RPI : RegPairs) {
- unsigned Reg1 = RPI.Reg1;
- unsigned Reg2 = RPI.Reg2;
+ Register Reg1 = RPI.Reg1;
+ Register Reg2 = RPI.Reg2;
// Issue sequence of restores for cs regs. The last restore may be converted
// to a post-increment load later by emitEpilogue if the callee-save stack
@@ -2176,11 +2179,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
case RegPairInfo::VG:
continue;
}
- LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
- if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
- dbgs() << ") -> fi#(" << RPI.FrameIdx;
- if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
- dbgs() << ")\n");
+ LLVM_DEBUG({
+ dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+ if (RPI.isPaired())
+ dbgs() << ", " << printReg(Reg2, TRI);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx + 1;
+ dbgs() << ")\n";
+ });
// Windows unwind codes require consecutive registers if registers are
// paired. Make the switch here, so that the code below will save (x,x+1)
@@ -2435,8 +2442,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2444,9 +2450,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
MachineFrameInfo &MFI = MF.getFrameInfo();
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
- unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
- ? RegInfo->getBaseRegister()
- : (unsigned)AArch64::NoRegister;
+ MCRegister BasePointerReg =
+ RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister();
unsigned ExtraCSSpill = 0;
bool HasUnpairedGPR64 = false;
@@ -2456,7 +2461,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
- const unsigned Reg = CSRegs[i];
+ const MCRegister Reg = CSRegs[i];
// Add the base pointer register to SavedRegs if it is callee-save.
if (Reg == BasePointerReg)
@@ -2470,7 +2475,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
bool RegUsed = SavedRegs.test(Reg);
- unsigned PairedReg = AArch64::NoRegister;
+ MCRegister PairedReg;
const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
AArch64::FPR128RegClass.contains(Reg)) {
@@ -2522,8 +2527,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// Find a suitable predicate register for the multi-vector spill/fill
// instructions.
- unsigned PnReg = findFreePredicateReg(SavedRegs);
- if (PnReg != AArch64::NoRegister)
+ MCRegister PnReg = findFreePredicateReg(SavedRegs);
+ if (PnReg.isValid())
AFI->setPredicateRegForFillSpill(PnReg);
// If no free callee-save has been found assign one.
if (!AFI->getPredicateRegForFillSpill() &&
@@ -2558,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned PPRCSStackSize = 0;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
for (unsigned Reg : SavedRegs.set_bits()) {
- auto *RC = TRI->getMinimalPhysRegClass(Reg);
+ auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg));
assert(RC && "expected register class!");
auto SpillSize = TRI->getSpillSize(*RC);
bool IsZPR = AArch64::ZPRRegClass.contains(Reg);
@@ -2600,7 +2605,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
LLVM_DEBUG({
dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
for (unsigned Reg : SavedRegs.set_bits())
- dbgs() << ' ' << printReg(Reg, RegInfo);
+ dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo);
dbgs() << "\n";
});
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index d67182d..03dd1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -649,7 +649,7 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
}
bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
bool Modified = false;
for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 79975b0..5bfb19d9 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -620,7 +620,7 @@ AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
return RC;
}
-unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+MCRegister AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 47d76f3..3b0f4f6 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -124,7 +124,7 @@ public:
bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
- unsigned getBaseRegister() const;
+ MCRegister getBaseRegister() const;
bool isArgumentRegister(const MachineFunction &MF,
MCRegister Reg) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index d695f26..b4a4f4c 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -33,6 +33,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -49,8 +50,8 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
-#include <unordered_map>
#include <map>
+#include <unordered_map>
using namespace llvm;
@@ -67,7 +68,7 @@ namespace {
struct AArch64SIMDInstrOpt : public MachineFunctionPass {
static char ID;
- const TargetInstrInfo *TII;
+ const AArch64InstrInfo *TII;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
@@ -694,13 +695,9 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
- const TargetSubtargetInfo &ST = MF.getSubtarget();
- const AArch64InstrInfo *AAII =
- static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
- if (!AAII)
- return false;
+ const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+ TII = ST.getInstrInfo();
SchedModel.init(&ST);
if (!SchedModel.hasInstrSchedModel())
return false;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 5c3e26e..4cd51d6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI,
} else
return false;
- std::string Str;
llvm::transform(Name, Name.begin(), ::tolower);
O << '\t' << Ins << '\t' << Reg.str() << ", " << Name;
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index 7453727..b60569e 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -421,6 +421,17 @@ def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline,
FeatureMVEVectorCostFactor1,
HasMVEFloatOps]>;
+def : ProcessorModel<"star-mc3", CortexM55Model, [ARMv81mMainline,
+ FeatureDSP,
+ FeatureFPARMv8_D16,
+ FeatureHasNoBranchPredictor,
+ FeaturePACBTI,
+ FeatureUseMISched,
+ FeaturePreferBranchAlign32,
+ FeatureHasSlowFPVMLx,
+ FeatureMVEVectorCostFactor1,
+ HasMVEFloatOps]>;
+
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ca4a655..80c96c6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1701,6 +1701,43 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
DAG.getConstant(Imm, DL, GRLenVT));
}
+/// Lower VECTOR_SHUFFLE whose result is the reversed source vector.
+///
+/// It is possible to optimize a VECTOR_SHUFFLE that performs a vector
+/// reverse, i.e. whose mask looks like:
+///   <7, 6, 5, 4, 3, 2, 1, 0>
+///
+/// When undefs appear in the mask, they are treated as whatever value is
+/// necessary to fit the above form.
+static SDValue
+lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
+ // Only vectors with i8/i16 elements which cannot match other patterns
+ // directly needs to do this.
+ if (VT != MVT::v16i8 && VT != MVT::v8i16 && VT != MVT::v32i8 &&
+ VT != MVT::v16i16)
+ return SDValue();
+
+ if (!ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+ return SDValue();
+
+ int WidenNumElts = VT.getVectorNumElements() / 4;
+ SmallVector<int, 16> WidenMask(WidenNumElts, -1);
+ for (int i = 0; i < WidenNumElts; ++i)
+ WidenMask[i] = WidenNumElts - 1 - i;
+
+ MVT WidenVT = MVT::getVectorVT(
+ VT.getVectorElementType() == MVT::i8 ? MVT::i32 : MVT::i64, WidenNumElts);
+ SDValue NewV1 = DAG.getBitcast(WidenVT, V1);
+ SDValue WidenRev = DAG.getVectorShuffle(WidenVT, DL, NewV1,
+ DAG.getUNDEF(WidenVT), WidenMask);
+
+ return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT,
+ DAG.getBitcast(VT, WidenRev),
+ DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
+}
+
/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
///
/// VPACKEV interleaves the even elements from each vector.
@@ -2004,6 +2041,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result =
lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
return Result;
+ if ((Result =
+ lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+ return Result;
// TODO: This comment may be enabled in the future to better match the
// pattern for instruction selection.
@@ -2622,6 +2662,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
+ if ((Result =
+ lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+ return Result;
// TODO: This comment may be enabled in the future to better match the
// pattern for instruction selection.
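
The reverse-shuffle lowering added above decomposes a full vector reverse into a reverse of widened lanes followed by a VSHUF4I that reverses the elements inside each lane. A worked sketch of the mask and immediate arithmetic in plain C++, assuming the usual LSB-first, 2-bits-per-position reading of the VSHUF4I immediate (so immediate 27 = 0b00011011 gives the per-lane order <3,2,1,0>):

#include <cstdio>

int main() {
  const int NumElts = 16;               // v16i8
  const int WidenNumElts = NumElts / 4; // viewed as v4i32
  int WidenMask[WidenNumElts];
  for (int i = 0; i < WidenNumElts; ++i)
    WidenMask[i] = WidenNumElts - 1 - i; // <3, 2, 1, 0> on i32 lanes
  std::printf("widened mask:");
  for (int i = 0; i < WidenNumElts; ++i)
    std::printf(" %d", WidenMask[i]);
  // ...then reverse the 4 bytes inside each i32 lane with VSHUF4I:
  // position k takes element (27 >> (2*k)) & 3.
  std::printf("\nper-lane order:");
  for (int k = 0; k < 4; ++k)
    std::printf(" %u", (27u >> (2 * k)) & 3u);
  std::printf("\n");
  return 0;
}
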
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 598735f..c923f0e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1082,6 +1082,161 @@ let Predicates = [hasPTX<70>, hasSM<80>] in {
"mbarrier.pending_count.b64",
[(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>;
}
+
+class MBAR_UTIL<string op, string scope,
+ string space = "", string sem = "",
+ bit tl = 0, bit parity = 0> {
+ // The mbarrier instructions in PTX ISA are of the general form:
+ // mbarrier.op.semantics.scope.space.b64 arg1, arg2 ...
+ // where:
+ // op -> arrive, expect_tx, complete_tx, arrive.expect_tx etc.
+ // semantics -> acquire, release, relaxed (default depends on the op)
+ // scope -> cta or cluster (default is cta-scope)
+ // space -> shared::cta or shared::cluster (default is shared::cta)
+ //
+ // The 'semantics' and 'scope' go together. If one is specified,
+ // then the other _must_ be specified. For example:
+ // (A) mbarrier.arrive <args> (valid, release and cta are default)
+ // (B) mbarrier.arrive.release.cta <args> (valid, sem/scope mentioned explicitly)
+ // (C) mbarrier.arrive.release <args> (invalid, needs scope)
+ // (D) mbarrier.arrive.cta <args> (invalid, needs order)
+ //
+ // Wherever possible, we prefer form (A) to (B) since it is available
+ // from early PTX versions. In most cases, explicitly specifying the
+ // scope requires a later version of PTX.
+ string _scope_asm = !cond(
+ !eq(scope, "scope_cluster") : "cluster",
+ !eq(scope, "scope_cta") : !if(!empty(sem), "", "cta"),
+ true : scope);
+ string _space_asm = !cond(
+ !eq(space, "space_cta") : "shared",
+ !eq(space, "space_cluster") : "shared::cluster",
+ true : space);
+
+ string _parity = !if(parity, "parity", "");
+ string asm_str = StrJoin<".", ["mbarrier", op, _parity,
+ sem, _scope_asm, _space_asm, "b64"]>.ret;
+
+ string _intr_suffix = StrJoin<"_", [!subst(".", "_", op), _parity,
+ !if(tl, "tl", ""),
+ sem, scope, space]>.ret;
+ string intr_name = "int_nvvm_mbarrier_" # _intr_suffix;
+
+ // Predicate checks:
+ // These are used only for the "test_wait/try_wait" variants as they
+ // have evolved since sm80 and are complex. The predicates for the
+ // remaining instructions are straightforward and have already been
+ // applied directly.
+ Predicate _sm_pred = !cond(!or(
+ !eq(op, "try_wait"),
+ !eq(scope, "scope_cluster"),
+ !eq(sem, "relaxed")) : hasSM<90>,
+ true : hasSM<80>);
+ Predicate _ptx_pred = !cond(
+ !eq(sem, "relaxed") : hasPTX<86>,
+ !ne(_scope_asm, "") : hasPTX<80>,
+ !eq(op, "try_wait") : hasPTX<78>,
+ parity : hasPTX<71>,
+ true : hasPTX<70>);
+ list<Predicate> preds = [_ptx_pred, _sm_pred];
+}
+
+foreach op = ["expect_tx", "complete_tx"] in {
+ foreach scope = ["scope_cta", "scope_cluster"] in {
+ foreach space = ["space_cta", "space_cluster"] in {
+ defvar intr = !cast<Intrinsic>(MBAR_UTIL<op, scope, space>.intr_name);
+ defvar suffix = StrJoin<"_", [op, scope, space]>.ret;
+ def mbar_ # suffix : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$tx_count),
+ MBAR_UTIL<op, scope, space, "relaxed">.asm_str,
+ [(intr addr:$addr, i32:$tx_count)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ } // space
+ } // scope
+} // op
+
+multiclass MBAR_ARR_INTR<string op, string scope, string sem,
+ list<Predicate> pred = []> {
+ // When either of sem or scope is non-default, both have to
+ // be explicitly specified. So, explicitly state that
+ // sem is `release` when scope is `cluster`.
+ defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+ "release", sem);
+
+ defvar asm_cta = MBAR_UTIL<op, scope, "space_cta", asm_sem>.asm_str;
+ defvar intr_cta = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+ "space_cta", sem>.intr_name);
+
+ defvar asm_cluster = MBAR_UTIL<op, scope, "space_cluster", asm_sem>.asm_str;
+ defvar intr_cluster = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+ "space_cluster", sem>.intr_name);
+
+ def _CTA : NVPTXInst<(outs B64:$state),
+ (ins ADDR:$addr, B32:$tx_count),
+ asm_cta # " $state, [$addr], $tx_count;",
+ [(set i64:$state, (intr_cta addr:$addr, i32:$tx_count))]>,
+ Requires<pred>;
+ def _CLUSTER : NVPTXInst<(outs),
+ (ins ADDR:$addr, B32:$tx_count),
+ asm_cluster # " _, [$addr], $tx_count;",
+ [(intr_cluster addr:$addr, i32:$tx_count)]>,
+ Requires<pred>;
+}
+foreach op = ["arrive", "arrive.expect_tx",
+ "arrive_drop", "arrive_drop.expect_tx"] in {
+ foreach scope = ["scope_cta", "scope_cluster"] in {
+ defvar suffix = !subst(".", "_", op) # scope;
+ defm mbar_ # suffix # _release : MBAR_ARR_INTR<op, scope, "", [hasPTX<80>, hasSM<90>]>;
+ defm mbar_ # suffix # _relaxed : MBAR_ARR_INTR<op, scope, "relaxed", [hasPTX<86>, hasSM<90>]>;
+ } // scope
+} // op
+
+multiclass MBAR_WAIT_INTR<string op, string scope, string sem, bit time_limit> {
+ // When either of sem or scope is non-default, both have to
+ // be explicitly specified. So, explicitly state that the
+ // semantics is `acquire` when the scope is `cluster`.
+ defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+ "acquire", sem);
+
+ defvar asm_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+ time_limit, 1>.asm_str;
+ defvar pred_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+ time_limit, 1>.preds;
+ defvar intr_parity = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+ sem, time_limit, 1>.intr_name);
+
+ defvar asm_state = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+ time_limit>.asm_str;
+ defvar pred_state = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+ time_limit>.preds;
+ defvar intr_state = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+ sem, time_limit>.intr_name);
+
+ defvar ins_tl_dag = !if(time_limit, (ins B32:$tl), (ins));
+ defvar tl_suffix = !if(time_limit, ", $tl;", ";");
+ defvar intr_state_dag = !con((intr_state addr:$addr, i64:$state),
+ !if(time_limit, (intr_state i32:$tl), (intr_state)));
+ defvar intr_parity_dag = !con((intr_parity addr:$addr, i32:$phase),
+ !if(time_limit, (intr_parity i32:$tl), (intr_parity)));
+
+ def _STATE : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B64:$state), ins_tl_dag),
+ asm_state # " $res, [$addr], $state" # tl_suffix,
+ [(set i1:$res, intr_state_dag)]>,
+ Requires<pred_state>;
+ def _PARITY : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B32:$phase), ins_tl_dag),
+ asm_parity # " $res, [$addr], $phase" # tl_suffix,
+ [(set i1:$res, intr_parity_dag)]>,
+ Requires<pred_parity>;
+}
+foreach op = ["test_wait", "try_wait"] in {
+ foreach scope = ["scope_cta", "scope_cluster"] in {
+ foreach time_limit = !if(!eq(op, "try_wait"), [true, false], [false]) in {
+ defvar suffix = StrJoin<"_", [op, scope, !if(time_limit, "tl", "")]>.ret;
+ defm mbar_ # suffix # "_acquire" : MBAR_WAIT_INTR<op, scope, "", time_limit>;
+ defm mbar_ # suffix # "_relaxed" : MBAR_WAIT_INTR<op, scope, "relaxed", time_limit>;
+ } // time_limit
+ } // scope
+} // op
+
//-----------------------------------
// Math Functions
//-----------------------------------
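
To make the MBAR_UTIL string plumbing above concrete: assuming StrJoin concatenates only the non-empty components with the given separator (an assumption consistent with forms (A)-(D) in the comment), MBAR_UTIL<"arrive", "scope_cluster", "space_cluster", "release"> yields the asm string "mbarrier.arrive.release.cluster.shared::cluster.b64" and the intrinsic name int_nvvm_mbarrier_arrive_release_scope_cluster_space_cluster. A minimal C++ sketch of that join:

#include <cstdio>
#include <string>
#include <vector>

// Sketch of a StrJoin that skips empty components (an assumption about
// the TableGen helper, consistent with the examples in the comment).
static std::string strJoin(const std::vector<std::string> &Parts,
                           const std::string &Sep) {
  std::string Result;
  for (const std::string &P : Parts) {
    if (P.empty())
      continue;
    if (!Result.empty())
      Result += Sep;
    Result += P;
  }
  return Result;
}

int main() {
  // op="arrive", parity off, sem="release", scope=cluster, space=cluster:
  std::string Asm = strJoin({"mbarrier", "arrive", /*parity*/ "",
                             "release", "cluster", "shared::cluster", "b64"},
                            ".");
  std::printf("%s\n", Asm.c_str());
  return 0;
}
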
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 53633ea..8198173 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -92,6 +92,8 @@ private:
void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
MachineIRBuilder &MIB) const;
bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
+ bool selectIntrinsicWithSideEffects(MachineInstr &I,
+ MachineIRBuilder &MIB) const;
ComplexRendererFns selectShiftMask(MachineOperand &Root,
unsigned ShiftWidth) const;
@@ -714,6 +716,88 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
return GenericOpc;
}
+bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
+ MachineInstr &I, MachineIRBuilder &MIB) const {
+ // Find the intrinsic ID.
+ unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
+ // Select the instruction.
+ switch (IntrinID) {
+ default:
+ return false;
+ case Intrinsic::riscv_vlm:
+ case Intrinsic::riscv_vle:
+ case Intrinsic::riscv_vle_mask:
+ case Intrinsic::riscv_vlse:
+ case Intrinsic::riscv_vlse_mask: {
+ bool IsMasked = IntrinID == Intrinsic::riscv_vle_mask ||
+ IntrinID == Intrinsic::riscv_vlse_mask;
+ bool IsStrided = IntrinID == Intrinsic::riscv_vlse ||
+ IntrinID == Intrinsic::riscv_vlse_mask;
+ LLT VT = MRI->getType(I.getOperand(0).getReg());
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+ // Result vector
+ const Register DstReg = I.getOperand(0).getReg();
+
+ // Sources
+ bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm;
+ unsigned CurOp = 2;
+ SmallVector<SrcOp, 4> SrcOps; // Source registers.
+
+ // Passthru
+ if (HasPassthruOperand) {
+ auto PassthruReg = I.getOperand(CurOp++).getReg();
+ SrcOps.push_back(PassthruReg);
+ } else {
+ SrcOps.push_back(Register(RISCV::NoRegister));
+ }
+
+ // Base Pointer
+ auto PtrReg = I.getOperand(CurOp++).getReg();
+ SrcOps.push_back(PtrReg);
+
+ // Stride
+ if (IsStrided) {
+ auto StrideReg = I.getOperand(CurOp++).getReg();
+ SrcOps.push_back(StrideReg);
+ }
+
+ // Mask
+ if (IsMasked) {
+ auto MaskReg = I.getOperand(CurOp++).getReg();
+ SrcOps.push_back(MaskReg);
+ }
+
+ RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
+ const RISCV::VLEPseudo *P =
+ RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+ static_cast<unsigned>(LMUL));
+
+ auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps);
+
+ // Select VL
+ auto VLOpFn = renderVLOp(I.getOperand(CurOp++));
+ for (auto &RenderFn : *VLOpFn)
+ RenderFn(PseudoMI);
+
+ // SEW
+ PseudoMI.addImm(Log2SEW);
+
+ // Policy
+ uint64_t Policy = RISCVVType::MASK_AGNOSTIC;
+ if (IsMasked)
+ Policy = I.getOperand(CurOp++).getImm();
+ PseudoMI.addImm(Policy);
+
+ // Memref
+ PseudoMI.cloneMemRefs(I);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
+ }
+ }
+}
+
bool RISCVInstructionSelector::select(MachineInstr &MI) {
MachineIRBuilder MIB(MI);
@@ -984,6 +1068,8 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI);
}
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ return selectIntrinsicWithSideEffects(MI, MIB);
default:
return false;
}
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 6065575..c8d1938 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -369,6 +369,7 @@ getHostCPUNameForARMFromComponents(StringRef Implementer, StringRef Hardware,
if (Implementer == "0x63") { // Arm China.
return StringSwitch<const char *>(Part)
.Case("0x132", "star-mc1")
+ .Case("0xd25", "star-mc3")
.Default("generic");
}
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 6d16599..5048561 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1044,15 +1044,13 @@ struct AAPointerInfoImpl
return AAPointerInfo::manifest(A);
}
- virtual const_bin_iterator begin() const override { return State::begin(); }
- virtual const_bin_iterator end() const override { return State::end(); }
- virtual int64_t numOffsetBins() const override {
- return State::numOffsetBins();
- }
- virtual bool reachesReturn() const override {
+ const_bin_iterator begin() const override { return State::begin(); }
+ const_bin_iterator end() const override { return State::end(); }
+ int64_t numOffsetBins() const override { return State::numOffsetBins(); }
+ bool reachesReturn() const override {
return !ReturnedOffsets.isUnassigned();
}
- virtual void addReturnedOffsetsTo(OffsetInfo &OI) const override {
+ void addReturnedOffsetsTo(OffsetInfo &OI) const override {
if (ReturnedOffsets.isUnknown()) {
OI.setUnknown();
return;
@@ -6653,7 +6651,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
: AAHeapToStack(IRP, A) {}
- ~AAHeapToStackFunction() {
+ ~AAHeapToStackFunction() override {
// Ensure we call the destructor so we release any memory allocated in the
// sets.
for (auto &It : AllocationInfos)
@@ -8374,7 +8372,7 @@ struct AAMemoryLocationImpl : public AAMemoryLocation {
AccessKind2Accesses.fill(nullptr);
}
- ~AAMemoryLocationImpl() {
+ ~AAMemoryLocationImpl() override {
// The AccessSets are allocated via a BumpPtrAllocator, we call
// the destructor manually.
for (AccessSet *AS : AccessKind2Accesses)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 5e2247f..d7eb745 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2693,7 +2693,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
: AAExecutionDomain(IRP, A) {}
- ~AAExecutionDomainFunction() { delete RPOT; }
+ ~AAExecutionDomainFunction() override { delete RPOT; }
void initialize(Attributor &A) override {
Function *F = getAnchorScope();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ede73f8..9c75d9a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -72,7 +72,7 @@ public:
: InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI,
PSI, DL, RPOT) {}
- virtual ~InstCombinerImpl() = default;
+ ~InstCombinerImpl() override = default;
/// Perform early cleanup and prepare the InstCombine worklist.
bool prepareWorklist(Function &F);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9c8de45..67f837c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3358,21 +3358,21 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (TyAllocSize == 1) {
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) to (bitcast Y),
- // but only if the result pointer is only used as if it were an integer,
- // or both point to the same underlying object (otherwise provenance is
- // not necessarily retained).
+ // but only if the result pointer is only used as if it were an integer.
+ // (The case where the underlying object is the same is handled by
+ // InstSimplify.)
Value *X = GEP.getPointerOperand();
Value *Y;
- if (match(GEP.getOperand(1),
- m_Sub(m_PtrToInt(m_Value(Y)), m_PtrToInt(m_Specific(X)))) &&
+ if (match(GEP.getOperand(1), m_Sub(m_PtrToIntOrAddr(m_Value(Y)),
+ m_PtrToIntOrAddr(m_Specific(X)))) &&
GEPType == Y->getType()) {
- bool HasSameUnderlyingObject =
- getUnderlyingObject(X) == getUnderlyingObject(Y);
+ bool HasNonAddressBits =
+ DL.getAddressSizeInBits(AS) != DL.getPointerSizeInBits(AS);
bool Changed = false;
GEP.replaceUsesWithIf(Y, [&](Use &U) {
- bool ShouldReplace = HasSameUnderlyingObject ||
- isa<ICmpInst>(U.getUser()) ||
- isa<PtrToIntInst>(U.getUser());
+ bool ShouldReplace = isa<PtrToAddrInst>(U.getUser()) ||
+ (!HasNonAddressBits &&
+ isa<ICmpInst, PtrToIntInst>(U.getUser()));
Changed |= ShouldReplace;
return ShouldReplace;
});
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d831c27..c537be5c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7551,6 +7551,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
/// log2(C)-indexed value table (instead of traditionally emitting a load of the
/// address of the jump target, and indirectly jump to it).
static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
+ DomTreeUpdater *DTU,
const DataLayout &DL,
const TargetTransformInfo &TTI) {
Value *Condition = SI->getCondition();
@@ -7573,12 +7574,6 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
if (SI->getNumCases() < 4)
return false;
- // We perform this optimization only for switches with
- // unreachable default case.
- // This assumtion will save us from checking if `Condition` is a power of two.
- if (!SI->defaultDestUnreachable())
- return false;
-
// Check that switch cases are powers of two.
SmallVector<uint64_t, 4> Values;
for (const auto &Case : SI->cases()) {
@@ -7598,6 +7593,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
Builder.SetInsertPoint(SI);
+ if (!SI->defaultDestUnreachable()) {
+ // Let non-power-of-two inputs jump to the default case, when the latter is
+ // reachable.
+ auto *PopC = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, Condition);
+ auto *IsPow2 = Builder.CreateICmpEQ(PopC, ConstantInt::get(CondTy, 1));
+
+ auto *OrigBB = SI->getParent();
+ auto *DefaultCaseBB = SI->getDefaultDest();
+ BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
+ auto It = OrigBB->getTerminator()->getIterator();
+ BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ It->eraseFromParent();
+
+ addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, OrigBB, DefaultCaseBB}});
+ }
+
// Replace each case with its trailing zeros number.
for (auto &Case : SI->cases()) {
auto *OrigValue = Case.getCaseValue();
@@ -7953,7 +7966,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
Options.ConvertSwitchToLookupTable))
return requestResimplify();
- if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
+ if (simplifySwitchOfPowersOfTwo(SI, Builder, DTU, DL, TTI))
return requestResimplify();
if (reduceSwitchRange(SI, Builder, DL, TTI))
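
The effect of the simplifySwitchOfPowersOfTwo change can be illustrated outside of IR: cases such as {1, 2, 4, 8} are remapped to their trailing-zero counts, and when the default destination is reachable, a ctpop == 1 guard now routes non-powers-of-two to the default block first. A behavioral sketch in plain C++ (using the GCC/Clang builtins for ctpop/cttz):

#include <cstdio>

// Sketch of the rewritten control flow for a switch over {1, 2, 4, 8}.
static const char *dispatch(unsigned Cond) {
  if (__builtin_popcount(Cond) != 1) // new guard: reachable default
    return "default";
  switch (__builtin_ctz(Cond)) {     // switch on the trailing-zero count
  case 0: return "one";
  case 1: return "two";
  case 2: return "four";
  case 3: return "eight";
  default: return "default";
  }
}

int main() {
  for (unsigned Cond : {1u, 2u, 3u, 8u, 12u})
    std::printf("%u -> %s\n", Cond, dispatch(Cond));
  return 0;
}
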
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cdb9e7e..4fcaf6d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17641,12 +17641,28 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
[](Value *V) {
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
})) ||
- all_of(E->Scalars, [&](Value *V) {
- return isa<PoisonValue>(V) ||
- (E->Idx == 0 && isa<InsertElementInst>(V)) ||
- E->isCopyableElement(V) ||
- (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
- }))
+ (all_of(E->Scalars,
+ [&](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (E->Idx == 0 && isa<InsertElementInst>(V)) ||
+ E->isCopyableElement(V) ||
+ (!isVectorLikeInstWithConstOps(V) &&
+ isUsedOutsideBlock(V));
+ }) &&
+ (!E->doesNotNeedToSchedule() ||
+ any_of(E->Scalars,
+ [&](Value *V) {
+ if (!isa<Instruction>(V) ||
+ (E->hasCopyableElements() && E->isCopyableElement(V)))
+ return false;
+ return !areAllOperandsNonInsts(V);
+ }) ||
+ none_of(E->Scalars, [&](Value *V) {
+ if (!isa<Instruction>(V) ||
+ (E->hasCopyableElements() && E->isCopyableElement(V)))
+ return false;
+ return MustGather.contains(V);
+ }))))
Res = FindLastInst();
else
Res = FindFirstInst();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2591df8..5b9f005 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -398,7 +398,7 @@ public:
DebugLoc DL = DebugLoc::getUnknown())
: VPDef(SC), VPUser(Operands), DL(DL) {}
- virtual ~VPRecipeBase() = default;
+ ~VPRecipeBase() override = default;
/// Clone the current recipe.
virtual VPRecipeBase *clone() = 0;
@@ -576,7 +576,7 @@ public:
return R && classof(R);
}
- virtual VPSingleDefRecipe *clone() override = 0;
+ VPSingleDefRecipe *clone() override = 0;
/// Returns the underlying instruction.
Instruction *getUnderlyingInstr() {
@@ -907,7 +907,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
return R && classof(R);
}
- virtual VPRecipeWithIRFlags *clone() override = 0;
+ VPRecipeWithIRFlags *clone() override = 0;
static inline bool classof(const VPSingleDefRecipe *U) {
auto *R = dyn_cast<VPRecipeBase>(U);
@@ -2068,7 +2068,7 @@ public:
return classof(static_cast<const VPRecipeBase *>(R));
}
- virtual void execute(VPTransformState &State) override = 0;
+ void execute(VPTransformState &State) override = 0;
/// Returns the step value of the induction.
VPValue *getStepValue() { return getOperand(1); }
@@ -2557,7 +2557,7 @@ public:
VPCostContext &Ctx) const override;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+ bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
/// Returns the number of stored operands of this interleave group. Returns 0
/// for load interleave groups.