Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/InlineCost.cpp | 6
-rw-r--r--  llvm/lib/Analysis/LoopCacheAnalysis.cpp | 7
-rw-r--r--  llvm/lib/Analysis/MemoryLocation.cpp | 28
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp | 20
-rw-r--r--  llvm/lib/CGData/CodeGenDataReader.cpp | 2
-rw-r--r--  llvm/lib/CGData/StableFunctionMap.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 21
-rw-r--r--  llvm/lib/CodeGen/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/CodeGen/ExpandFp.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/InlineSpiller.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/MIR2Vec.cpp | 91
-rw-r--r--  llvm/lib/CodeGen/MachineBlockHashInfo.cpp | 115
-rw-r--r--  llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 15
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 26
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/TargetPassConfig.cpp | 8
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp | 4
-rw-r--r--  llvm/lib/Frontend/HLSL/CBuffer.cpp | 9
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 15
-rw-r--r--  llvm/lib/LTO/LTO.cpp | 15
-rw-r--r--  llvm/lib/MC/MCAsmInfoELF.cpp | 2
-rw-r--r--  llvm/lib/MC/MCObjectFileInfo.cpp | 4
-rw-r--r--  llvm/lib/MC/MCParser/ELFAsmParser.cpp | 2
-rw-r--r--  llvm/lib/Object/ELF.cpp | 1
-rw-r--r--  llvm/lib/ObjectYAML/ELFYAML.cpp | 1
-rw-r--r--  llvm/lib/Passes/PassBuilder.cpp | 81
-rw-r--r--  llvm/lib/Passes/PassRegistry.def | 32
-rw-r--r--  llvm/lib/Support/GlobPattern.cpp | 67
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 13
-rw-r--r--  llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 434
-rw-r--r--  llvm/lib/Target/ARM/ARMAsmPrinter.h | 11
-rw-r--r--  llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 69
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrInfo.td | 30
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetMachine.cpp | 12
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 20
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp | 51
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCombine.td | 11
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 34
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 31
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 36
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 4
-rw-r--r--  llvm/lib/TargetParser/TargetParser.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 31
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 25
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 23
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 3
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 20
-rw-r--r--  llvm/lib/Transforms/Scalar/MergeICmps.cpp | 34
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 25
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 67
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 196
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 22
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2
75 files changed, 1526 insertions, 410 deletions
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 757f689..c4fee39 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -751,7 +751,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
if (CA.analyze().isSuccess()) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
- Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+ addCost(-std::max(0, CA.getThreshold() - CA.getCost()));
}
} else
// Otherwise simply add the cost for merely making the call.
@@ -1191,7 +1191,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
// If this function uses the coldcc calling convention, prefer not to inline
// it.
if (F.getCallingConv() == CallingConv::Cold)
- Cost += InlineConstants::ColdccPenalty;
+ addCost(InlineConstants::ColdccPenalty);
LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
@@ -2193,7 +2193,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
// the cost of inlining it drops dramatically. It may seem odd to update
// Cost in updateThreshold, but the bonus depends on the logic in this method.
if (isSoleCallToLocalFunction(Call, F)) {
- Cost -= LastCallToStaticBonus;
+ addCost(-LastCallToStaticBonus);
StaticBonusApplied = LastCallToStaticBonus;
}
}
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 050c327..424a7fe 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -436,10 +436,9 @@ bool IndexedReference::delinearize(const LoopInfo &LI) {
const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr;
if (StepRec && SE.isKnownNegative(StepRec))
- AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(),
- SE.getNegativeSCEV(StepRec),
- AccessFnAR->getLoop(),
- AccessFnAR->getNoWrapFlags());
+ AccessFn = SE.getAddRecExpr(
+ AccessFnAR->getStart(), SE.getNegativeSCEV(StepRec),
+ AccessFnAR->getLoop(), SCEV::NoWrapFlags::FlagAnyWrap);
const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
Subscripts.push_back(Div);
Sizes.push_back(ElemSize);
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 1c5f08e..edca387 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -288,6 +288,34 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
LocationSize::precise(DL.getTypeStoreSize(
II->getArgOperand(1)->getType())),
AATags);
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store: {
+ bool IsLoad = II->getIntrinsicID() == Intrinsic::matrix_column_major_load;
+ assert(ArgIdx == (IsLoad ? 0 : 1) && "Invalid argument index");
+
+ auto *Stride = dyn_cast<ConstantInt>(II->getArgOperand(IsLoad ? 1 : 2));
+ uint64_t Rows =
+ cast<ConstantInt>(II->getArgOperand(IsLoad ? 3 : 4))->getZExtValue();
+ uint64_t Cols =
+ cast<ConstantInt>(II->getArgOperand(IsLoad ? 4 : 5))->getZExtValue();
+
+ // The stride is dynamic, so there's nothing we can say.
+ if (!Stride)
+ return MemoryLocation(Arg, LocationSize::afterPointer(), AATags);
+
+ uint64_t ConstStride = Stride->getZExtValue();
+ auto *VT = cast<VectorType>(IsLoad ? II->getType()
+ : II->getArgOperand(0)->getType());
+ assert(Cols != 0 && "Matrix cannot have 0 columns");
+ TypeSize Size = DL.getTypeAllocSize(VT->getScalarType()) *
+ (ConstStride * (Cols - 1) + Rows);
+
+ // In the unstrided case, we have a precise size, ...
+ if (ConstStride == Rows)
+ return MemoryLocation(Arg, LocationSize::precise(Size), AATags);
+ // otherwise we merely obtain an upper bound.
+ return MemoryLocation(Arg, LocationSize::upperBound(Size), AATags);
+ }
}
assert(
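
The size bound computed above is ElemSize * (ConstStride * (Cols - 1) + Rows). A standalone worked instance of that formula follows (the concrete numbers are hypothetical, not from the patch): a column-major access to a 2x3 matrix of double with constant stride 5 spans 96 bytes.

#include <cstdint>

// Hypothetical values: 2x3 matrix of double, constant stride 5.
int main() {
  uint64_t ElemSize = 8, Rows = 2, Cols = 3, ConstStride = 5;
  uint64_t Size = ElemSize * (ConstStride * (Cols - 1) + Rows); // 96 bytes
  // ConstStride != Rows, so the padding between columns is never touched
  // and 96 is an upper bound rather than a precise size.
  return Size == 96 ? 0 : 1;
}
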
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf62623..c47a1c1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1001,13 +1001,25 @@ InstructionCost TargetTransformInfo::getShuffleCost(
TargetTransformInfo::PartialReductionExtendKind
TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
- if (isa<SExtInst>(I))
- return PR_SignExtend;
- if (isa<ZExtInst>(I))
- return PR_ZeroExtend;
+ if (auto *Cast = dyn_cast<CastInst>(I))
+ return getPartialReductionExtendKind(Cast->getOpcode());
return PR_None;
}
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(
+ Instruction::CastOps CastOpc) {
+ switch (CastOpc) {
+ case Instruction::CastOps::ZExt:
+ return PR_ZeroExtend;
+ case Instruction::CastOps::SExt:
+ return PR_SignExtend;
+ default:
+ return PR_None;
+ }
+ llvm_unreachable("Unhandled cast opcode");
+}
+
TTI::CastContextHint
TargetTransformInfo::getCastContextHint(const Instruction *I) {
if (!I)
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index b1cd939..aeb4a4d 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -125,7 +125,7 @@ Error IndexedCodeGenDataReader::read() {
FunctionMapRecord.setReadStableFunctionMapNames(
IndexedCodeGenDataReadFunctionMapNames);
if (IndexedCodeGenDataLazyLoading)
- FunctionMapRecord.lazyDeserialize(SharedDataBuffer,
+ FunctionMapRecord.lazyDeserialize(std::move(SharedDataBuffer),
Header.StableFunctionMapOffset);
else
FunctionMapRecord.deserialize(Ptr);
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
index 46e04bd..d0fae3a 100644
--- a/llvm/lib/CGData/StableFunctionMap.cpp
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -137,6 +137,7 @@ size_t StableFunctionMap::size(SizeType Type) const {
const StableFunctionMap::StableFunctionEntries &
StableFunctionMap::at(HashFuncsMapType::key_type FunctionHash) const {
auto It = HashToFuncs.find(FunctionHash);
+ assert(It != HashToFuncs.end() && "FunctionHash not found!");
if (isLazilyLoaded())
deserializeLazyLoadingEntry(It);
return It->second.Entries;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index fefde64f..8aa488f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -41,6 +41,7 @@
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockHashInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -184,6 +185,8 @@ static cl::opt<bool> PrintLatency(
cl::desc("Print instruction latencies as verbose asm comments"), cl::Hidden,
cl::init(false));
+extern cl::opt<bool> EmitBBHash;
+
STATISTIC(EmittedInsts, "Number of machine instrs printed");
char AsmPrinter::ID = 0;
@@ -474,6 +477,8 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GCModuleInfo>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
+ if (EmitBBHash)
+ AU.addRequired<MachineBlockHashInfo>();
}
bool AsmPrinter::doInitialization(Module &M) {
@@ -1434,14 +1439,11 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges,
"BB entries info is required for BBFreq and BrProb "
"features");
}
- return {FuncEntryCountEnabled,
- BBFreqEnabled,
- BrProbEnabled,
+ return {FuncEntryCountEnabled, BBFreqEnabled, BrProbEnabled,
MF.hasBBSections() && NumMBBSectionRanges > 1,
// Use static_cast to avoid breakage of tests on windows.
- static_cast<bool>(BBAddrMapSkipEmitBBEntries),
- HasCalls,
- false};
+ static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls,
+ static_cast<bool>(EmitBBHash)};
}
void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
@@ -1500,6 +1502,9 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
PrevMBBEndSymbol = MBBSymbol;
}
+ auto MBHI =
+ Features.BBHash ? &getAnalysis<MachineBlockHashInfo>() : nullptr;
+
if (!Features.OmitBBEntries) {
OutStreamer->AddComment("BB id");
// Emit the BB ID for this basic block.
@@ -1527,6 +1532,10 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), CurrentLabel);
// Emit the Metadata.
OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB));
+ // Emit the Hash.
+ if (MBHI) {
+ OutStreamer->emitInt64(MBHI->getMBBHash(MBB));
+ }
}
PrevMBBEndSymbol = MBB.getEndSymbol();
}
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index b6872605..4373c53 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -108,6 +108,7 @@ add_llvm_component_library(LLVMCodeGen
LowerEmuTLS.cpp
MachineBasicBlock.cpp
MachineBlockFrequencyInfo.cpp
+ MachineBlockHashInfo.cpp
MachineBlockPlacement.cpp
MachineBranchProbabilityInfo.cpp
MachineCFGPrinter.cpp
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 2b5ced3..f44eb22 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -1108,8 +1108,8 @@ public:
};
} // namespace
-ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel)
- : TM(TM), OptLevel(OptLevel) {}
+ExpandFpPass::ExpandFpPass(const TargetMachine &TM, CodeGenOptLevel OptLevel)
+ : TM(&TM), OptLevel(OptLevel) {}
void ExpandFpPass::printPipeline(
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 1fe38d6..b49040b 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1862,15 +1862,19 @@ bool IRTranslator::translateVectorDeinterleave2Intrinsic(
void IRTranslator::getStackGuard(Register DstReg,
MachineIRBuilder &MIRBuilder) {
+ Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent());
+ if (!Global) {
+ LLVMContext &Ctx = MIRBuilder.getContext();
+ Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard"));
+ MIRBuilder.buildUndef(DstReg);
+ return;
+ }
+
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MRI->setRegClass(DstReg, TRI->getPointerRegClass());
auto MIB =
MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {});
- Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent());
- if (!Global)
- return;
-
unsigned AddrSpace = Global->getType()->getPointerAddressSpace();
LLT PtrTy = LLT::pointer(AddrSpace, DL->getPointerSizeInBits(AddrSpace));
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index d6e8505..c3e0964 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -721,6 +721,9 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
// Allocate a new register for the remat.
Register NewVReg = Edit->createFrom(Original);
+ // Constrain it to the register class of the spilled register (VirtReg).
+ MRI.constrainRegClass(NewVReg, MRI.getRegClass(VirtReg.reg()));
+
// Finally we can rematerialize OrigMI before MI.
SlotIndex DefIdx =
Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index 75ca06a..00b37e7 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -417,24 +417,39 @@ Expected<MIRVocabulary> MIRVocabulary::createDummyVocabForTest(
}
//===----------------------------------------------------------------------===//
-// MIR2VecVocabLegacyAnalysis Implementation
+// MIR2VecVocabProvider and MIR2VecVocabLegacyAnalysis
//===----------------------------------------------------------------------===//
-char MIR2VecVocabLegacyAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis",
- "MIR2Vec Vocabulary Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
-INITIALIZE_PASS_END(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis",
- "MIR2Vec Vocabulary Analysis", false, true)
+Expected<mir2vec::MIRVocabulary>
+MIR2VecVocabProvider::getVocabulary(const Module &M) {
+ VocabMap OpcVocab, CommonOperandVocab, PhyRegVocabMap, VirtRegVocabMap;
-StringRef MIR2VecVocabLegacyAnalysis::getPassName() const {
- return "MIR2Vec Vocabulary Analysis";
+ if (Error Err = readVocabulary(OpcVocab, CommonOperandVocab, PhyRegVocabMap,
+ VirtRegVocabMap))
+ return std::move(Err);
+
+ for (const auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ if (auto *MF = MMI.getMachineFunction(F)) {
+ auto &Subtarget = MF->getSubtarget();
+ if (const auto *TII = Subtarget.getInstrInfo())
+ if (const auto *TRI = Subtarget.getRegisterInfo())
+ return mir2vec::MIRVocabulary::create(
+ std::move(OpcVocab), std::move(CommonOperandVocab),
+ std::move(PhyRegVocabMap), std::move(VirtRegVocabMap), *TII, *TRI,
+ MF->getRegInfo());
+ }
+ }
+ return createStringError(errc::invalid_argument,
+ "No machine functions found in module");
}
-Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab,
- VocabMap &CommonOperandVocab,
- VocabMap &PhyRegVocabMap,
- VocabMap &VirtRegVocabMap) {
+Error MIR2VecVocabProvider::readVocabulary(VocabMap &OpcodeVocab,
+ VocabMap &CommonOperandVocab,
+ VocabMap &PhyRegVocabMap,
+ VocabMap &VirtRegVocabMap) {
if (VocabFile.empty())
return createStringError(
errc::invalid_argument,
@@ -483,49 +498,15 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab,
return Error::success();
}
-Expected<mir2vec::MIRVocabulary>
-MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
- if (Vocab.has_value())
- return std::move(Vocab.value());
-
- VocabMap OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap;
- if (Error Err =
- readVocabulary(OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap))
- return std::move(Err);
-
- // Get machine module info to access machine functions and target info
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
-
- // Find first available machine function to get target instruction info
- for (const auto &F : M) {
- if (F.isDeclaration())
- continue;
-
- if (auto *MF = MMI.getMachineFunction(F)) {
- auto &Subtarget = MF->getSubtarget();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- if (!TII) {
- return createStringError(errc::invalid_argument,
- "No TargetInstrInfo available; cannot create "
- "MIR2Vec vocabulary");
- }
-
- const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
- if (!TRI) {
- return createStringError(errc::invalid_argument,
- "No TargetRegisterInfo available; cannot "
- "create MIR2Vec vocabulary");
- }
-
- return mir2vec::MIRVocabulary::create(
- std::move(OpcMap), std::move(CommonOperandMap), std::move(PhyRegMap),
- std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
- }
- }
+char MIR2VecVocabLegacyAnalysis::ID = 0;
+INITIALIZE_PASS_BEGIN(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis",
+ "MIR2Vec Vocabulary Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
+INITIALIZE_PASS_END(MIR2VecVocabLegacyAnalysis, "mir2vec-vocab-analysis",
+ "MIR2Vec Vocabulary Analysis", false, true)
- // No machine functions available - return error
- return createStringError(errc::invalid_argument,
- "No machine functions found in module");
+StringRef MIR2VecVocabLegacyAnalysis::getPassName() const {
+ return "MIR2Vec Vocabulary Analysis";
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/MachineBlockHashInfo.cpp b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp
new file mode 100644
index 0000000..c4d9c0f
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp
@@ -0,0 +1,115 @@
+//===- llvm/CodeGen/MachineBlockHashInfo.cpp -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Compute the hashes of basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBlockHashInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+uint64_t hashBlock(const MachineBasicBlock &MBB, bool HashOperands) {
+ uint64_t Hash = 0;
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isMetaInstruction() || MI.isTerminator())
+ continue;
+ Hash = hashing::detail::hash_16_bytes(Hash, MI.getOpcode());
+ if (HashOperands) {
+ for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+ Hash =
+ hashing::detail::hash_16_bytes(Hash, hash_value(MI.getOperand(i)));
+ }
+ }
+ }
+ return Hash;
+}
+
+/// Fold a 64-bit integer to a 16-bit one.
+uint16_t fold_64_to_16(const uint64_t Value) {
+ uint16_t Res = static_cast<uint16_t>(Value);
+ Res ^= static_cast<uint16_t>(Value >> 16);
+ Res ^= static_cast<uint16_t>(Value >> 32);
+ Res ^= static_cast<uint16_t>(Value >> 48);
+ return Res;
+}
+
+INITIALIZE_PASS(MachineBlockHashInfo, "machine-block-hash",
+ "Machine Block Hash Analysis", true, true)
+
+char MachineBlockHashInfo::ID = 0;
+
+MachineBlockHashInfo::MachineBlockHashInfo() : MachineFunctionPass(ID) {
+ initializeMachineBlockHashInfoPass(*PassRegistry::getPassRegistry());
+}
+
+void MachineBlockHashInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+struct CollectHashInfo {
+ uint64_t Offset;
+ uint64_t OpcodeHash;
+ uint64_t InstrHash;
+ uint64_t NeighborHash;
+};
+
+bool MachineBlockHashInfo::runOnMachineFunction(MachineFunction &F) {
+ DenseMap<const MachineBasicBlock *, CollectHashInfo> HashInfos;
+ uint16_t Offset = 0;
+ // Initialize hash components
+ for (const MachineBasicBlock &MBB : F) {
+ // Offset of the machine basic block.
+ HashInfos[&MBB].Offset = Offset;
+ Offset += MBB.size();
+ // Hash opcodes only.
+ HashInfos[&MBB].OpcodeHash = hashBlock(MBB, /*HashOperands=*/false);
+ // Hash complete instructions
+ HashInfos[&MBB].InstrHash = hashBlock(MBB, /*HashOperands=*/true);
+ }
+
+ // Initialize neighbor hash
+ for (const MachineBasicBlock &MBB : F) {
+ uint64_t Hash = HashInfos[&MBB].OpcodeHash;
+ // Append hashes of successors
+ for (const MachineBasicBlock *SuccMBB : MBB.successors()) {
+ uint64_t SuccHash = HashInfos[SuccMBB].OpcodeHash;
+ Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
+ }
+ // Append hashes of predecessors
+ for (const MachineBasicBlock *PredMBB : MBB.predecessors()) {
+ uint64_t PredHash = HashInfos[PredMBB].OpcodeHash;
+ Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
+ }
+ HashInfos[&MBB].NeighborHash = Hash;
+ }
+
+ // Assign hashes
+ for (const MachineBasicBlock &MBB : F) {
+ const auto &HashInfo = HashInfos[&MBB];
+ BlendedBlockHash BlendedHash(fold_64_to_16(HashInfo.Offset),
+ fold_64_to_16(HashInfo.OpcodeHash),
+ fold_64_to_16(HashInfo.InstrHash),
+ fold_64_to_16(HashInfo.NeighborHash));
+ MBBHashInfo[&MBB] = BlendedHash.combine();
+ }
+
+ return false;
+}
+
+uint64_t MachineBlockHashInfo::getMBBHash(const MachineBasicBlock &MBB) {
+ return MBBHashInfo[&MBB];
+}
+
+MachineFunctionPass *llvm::createMachineBlockHashInfoPass() {
+ return new MachineBlockHashInfo();
+}
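
The 16-bit folding above XORs the four 16-bit chunks of a 64-bit hash. A minimal standalone replica with one worked value; the constexpr wrapper and the test value are mine, not part of the patch:

#include <cstdint>

// XOR-fold the four 16-bit chunks of a 64-bit value, as fold_64_to_16 does.
constexpr uint16_t fold64to16(uint64_t Value) {
  return static_cast<uint16_t>(Value) ^ static_cast<uint16_t>(Value >> 16) ^
         static_cast<uint16_t>(Value >> 32) ^ static_cast<uint16_t>(Value >> 48);
}

// 0x5678 ^ 0x1234 ^ 0xBEEF ^ 0xDEAD == 0x240E.
static_assert(fold64to16(0xDEADBEEF12345678ULL) == 0x240E, "worked example");
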
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index f54e2f2..620d3d3 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -593,7 +593,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
case Intrinsic::log:
Changed |= forEachCall(F, [&](CallInst *CI) {
Type *Ty = CI->getArgOperand(0)->getType();
- if (!isa<ScalableVectorType>(Ty))
+ if (!TM || !isa<ScalableVectorType>(Ty))
return false;
const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 603dc34..9656a30 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -890,6 +890,7 @@ private:
SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N);
SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+ SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N);
SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
SDValue ScalarizeVecRes_VSELECT(SDNode *N);
SDValue ScalarizeVecRes_SELECT(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 3b5f83f..bb4a8d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -69,6 +69,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
R = ScalarizeVecRes_UnaryOpWithExtraInput(N);
break;
case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::ATOMIC_LOAD:
+ R = ScalarizeVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
+ break;
case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
@@ -475,6 +478,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
return Op;
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) {
+ SDValue Result = DAG.getAtomicLoad(
+ N->getExtensionType(), SDLoc(N), N->getMemoryVT().getVectorElementType(),
+ N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(),
+ N->getMemOperand());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+ return Result;
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
assert(N->isUnindexed() && "Indexed vector load?");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 90edaf3..379242e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8620,7 +8620,10 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,
if (TLI->shouldPreservePtrArith(this->getMachineFunction().getFunction(),
BasePtrVT))
return getNode(ISD::PTRADD, DL, BasePtrVT, Ptr, Offset, Flags);
- return getNode(ISD::ADD, DL, BasePtrVT, Ptr, Offset, Flags);
+ // InBounds only applies to PTRADD; don't set it when we emit a plain ADD.
+ SDNodeFlags AddFlags = Flags;
+ AddFlags.setInBounds(false);
+ return getNode(ISD::ADD, DL, BasePtrVT, Ptr, Offset, AddFlags);
}
/// Returns true if memcpy source is constant data.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dcf2df3..bfa566a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3131,12 +3131,16 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
if (TLI.useLoadStackGuardNode(M)) {
Guard = getLoadStackGuard(DAG, dl, Chain);
} else {
- const Value *IRGuard = TLI.getSDagStackGuard(M);
- SDValue GuardPtr = getValue(IRGuard);
-
- Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr,
- MachinePointerInfo(IRGuard, 0), Align,
- MachineMemOperand::MOVolatile);
+ if (const Value *IRGuard = TLI.getSDagStackGuard(M)) {
+ SDValue GuardPtr = getValue(IRGuard);
+ Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr,
+ MachinePointerInfo(IRGuard, 0), Align,
+ MachineMemOperand::MOVolatile);
+ } else {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard"));
+ Guard = DAG.getPOISON(PtrMemTy);
+ }
}
// Perform the comparison via a getsetcc.
@@ -4386,6 +4390,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (NW.hasNoUnsignedWrap() ||
(int64_t(Offset) >= 0 && NW.hasNoUnsignedSignedWrap()))
Flags |= SDNodeFlags::NoUnsignedWrap;
+ Flags.setInBounds(NW.isInBounds());
N = DAG.getMemBasePlusOffset(
N, DAG.getConstant(Offset, dl, N.getValueType()), dl, Flags);
@@ -4429,6 +4434,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (NW.hasNoUnsignedWrap() ||
(Offs.isNonNegative() && NW.hasNoUnsignedSignedWrap()))
Flags.setNoUnsignedWrap(true);
+ Flags.setInBounds(NW.isInBounds());
OffsVal = DAG.getSExtOrTrunc(OffsVal, dl, N.getValueType());
@@ -4498,6 +4504,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// pointer index type (add nuw).
SDNodeFlags AddFlags;
AddFlags.setNoUnsignedWrap(NW.hasNoUnsignedWrap());
+ AddFlags.setInBounds(NW.isInBounds());
N = DAG.getMemBasePlusOffset(N, IdxN, dl, AddFlags);
}
@@ -7324,6 +7331,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Res = DAG.getPtrExtOrTrunc(Res, sdl, PtrTy);
} else {
const Value *Global = TLI.getSDagStackGuard(M);
+ if (!Global) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.diagnose(DiagnosticInfoGeneric("unable to lower stackguard"));
+ setValue(&I, DAG.getPOISON(PtrTy));
+ return;
+ }
+
Align Align = DAG.getDataLayout().getPrefTypeAlign(Global->getType());
Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
MachinePointerInfo(Global, 0), Align,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 39cbfad..77377d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -689,6 +689,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (getFlags().hasSameSign())
OS << " samesign";
+ if (getFlags().hasInBounds())
+ OS << " inbounds";
+
if (getFlags().hasNonNeg())
OS << " nneg";
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index b6169e6..10b7238 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -272,6 +272,12 @@ static cl::opt<bool>
cl::desc("Split static data sections into hot and cold "
"sections using profile information"));
+cl::opt<bool> EmitBBHash(
+ "emit-bb-hash",
+ cl::desc(
+ "Emit the hash of basic block in the SHT_LLVM_BB_ADDR_MAP section."),
+ cl::init(false), cl::Optional);
+
/// Allow standard passes to be disabled by command line options. This supports
/// simple binary flags that either suppress the pass or do nothing.
/// i.e. -disable-mypass=false has no effect.
@@ -1281,6 +1287,8 @@ void TargetPassConfig::addMachinePasses() {
// address map (or both).
if (TM->getBBSectionsType() != llvm::BasicBlockSection::None ||
TM->Options.BBAddrMap) {
+ if (EmitBBHash)
+ addPass(llvm::createMachineBlockHashInfoPass());
if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) {
addPass(llvm::createBasicBlockSectionsProfileReaderWrapperPass(
TM->getBBSectionsFuncListBuf()));
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
index 6c7e27e..fa04976 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
@@ -247,7 +247,7 @@ public:
StandardSegments(std::move(StandardSegments)),
FinalizationSegments(std::move(FinalizationSegments)) {}
- ~IPInFlightAlloc() {
+ ~IPInFlightAlloc() override {
assert(!G && "InFlight alloc neither abandoned nor finalized");
}
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
index 75ae80f..4ceff48 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
@@ -38,7 +38,7 @@ public:
MachODebugObjectSynthesizerBase(LinkGraph &G, ExecutorAddr RegisterActionAddr)
: G(G), RegisterActionAddr(RegisterActionAddr) {}
- virtual ~MachODebugObjectSynthesizerBase() = default;
+ ~MachODebugObjectSynthesizerBase() override = default;
Error preserveDebugSections() {
if (G.findSectionByName(SynthDebugSectionName)) {
diff --git a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp
index d1a6eaf..a2990ab 100644
--- a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp
@@ -55,7 +55,7 @@ public:
Plugins = Layer.Plugins;
}
- ~JITLinkCtx() {
+ ~JITLinkCtx() override {
// If there is an object buffer return function then use it to
// return ownership of the buffer.
if (Layer.ReturnObjectBuffer && ObjBuffer)
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
index fd805fbf..cdde733 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
@@ -92,7 +92,7 @@ public:
Name(std::move(Name)), Ctx(Ctx), Materialize(Materialize),
Discard(Discard), Destroy(Destroy) {}
- ~OrcCAPIMaterializationUnit() {
+ ~OrcCAPIMaterializationUnit() override {
if (Ctx)
Destroy(Ctx);
}
@@ -264,7 +264,7 @@ public:
LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate)
: Dispose(Dispose), Ctx(Ctx), TryToGenerate(TryToGenerate) {}
- ~CAPIDefinitionGenerator() {
+ ~CAPIDefinitionGenerator() override {
if (Dispose)
Dispose(Ctx);
}
diff --git a/llvm/lib/Frontend/HLSL/CBuffer.cpp b/llvm/lib/Frontend/HLSL/CBuffer.cpp
index 407b6ad..1f53c87 100644
--- a/llvm/lib/Frontend/HLSL/CBuffer.cpp
+++ b/llvm/lib/Frontend/HLSL/CBuffer.cpp
@@ -43,8 +43,13 @@ std::optional<CBufferMetadata> CBufferMetadata::get(Module &M) {
for (const MDNode *MD : CBufMD->operands()) {
assert(MD->getNumOperands() && "Invalid cbuffer metadata");
- auto *Handle = cast<GlobalVariable>(
- cast<ValueAsMetadata>(MD->getOperand(0))->getValue());
+ // For an unused cbuffer, the handle may have been optimized out
+ Metadata *OpMD = MD->getOperand(0);
+ if (!OpMD)
+ continue;
+
+ auto *Handle =
+ cast<GlobalVariable>(cast<ValueAsMetadata>(OpMD)->getValue());
CBufferMapping &Mapping = Result->Mappings.emplace_back(Handle);
for (int I = 1, E = MD->getNumOperands(); I < E; ++I) {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 03da154..7917712 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4446,10 +4446,12 @@ void Verifier::visitLoadInst(LoadInst &LI) {
Check(LI.getOrdering() != AtomicOrdering::Release &&
LI.getOrdering() != AtomicOrdering::AcquireRelease,
"Load cannot have Release ordering", &LI);
- Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
- "atomic load operand must have integer, pointer, or floating point "
- "type!",
+ Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+ ElTy->getScalarType()->isFloatingPointTy(),
+ "atomic load operand must have integer, pointer, floating point, "
+ "or vector type!",
ElTy, &LI);
+
checkAtomicMemAccessSize(ElTy, &LI);
} else {
Check(LI.getSyncScopeID() == SyncScope::System,
@@ -4472,9 +4474,10 @@ void Verifier::visitStoreInst(StoreInst &SI) {
Check(SI.getOrdering() != AtomicOrdering::Acquire &&
SI.getOrdering() != AtomicOrdering::AcquireRelease,
"Store cannot have Acquire ordering", &SI);
- Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
- "atomic store operand must have integer, pointer, or floating point "
- "type!",
+ Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+ ElTy->getScalarType()->isFloatingPointTy(),
+ "atomic store operand must have integer, pointer, floating point, "
+ "or vector type!",
ElTy, &SI);
checkAtomicMemAccessSize(ElTy, &SI);
} else {
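
The relaxed checks above now admit atomic loads and stores of vectors whose element type is integer, pointer, or floating point. A sketch that exercises the new rule through IRBuilder (standard LLVM-C++ APIs; the module setup is illustrative, not from the patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("m", Ctx);
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 2);
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx),
                                 {PointerType::get(Ctx, 0)}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  // load atomic <2 x i32>, ptr %p monotonic, align 8 -- previously rejected.
  LoadInst *LI = B.CreateAlignedLoad(VecTy, F->getArg(0), Align(8));
  LI->setAtomic(AtomicOrdering::Monotonic);
  B.CreateRetVoid();
  return verifyModule(M, &errs()) ? 1 : 0; // 0: module is well-formed
}
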
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 72ae064..9d0fa11 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -477,6 +477,10 @@ static void thinLTOInternalizeAndPromoteGUID(
return !GlobalValue::isLocalLinkage(Summary->linkage());
});
+ // Before performing index-based internalization and promotion for this GUID,
+ // the local flag should be consistent with the summary list linkage types.
+ VI.verifyLocal();
+
for (auto &S : VI.getSummaryList()) {
// First see if we need to promote an internal value because it is not
// exported.
@@ -2220,6 +2224,7 @@ class OutOfProcessThinBackend : public CGThinBackend {
ArrayRef<StringRef> DistributorArgs;
SString RemoteCompiler;
+ ArrayRef<StringRef> RemoteCompilerPrependArgs;
ArrayRef<StringRef> RemoteCompilerArgs;
bool SaveTemps;
@@ -2256,12 +2261,14 @@ public:
bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles,
StringRef LinkerOutputFile, StringRef Distributor,
ArrayRef<StringRef> DistributorArgs, StringRef RemoteCompiler,
+ ArrayRef<StringRef> RemoteCompilerPrependArgs,
ArrayRef<StringRef> RemoteCompilerArgs, bool SaveTemps)
: CGThinBackend(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, OnWrite, ShouldEmitIndexFiles,
ShouldEmitImportsFiles, ThinLTOParallelism),
LinkerOutputFile(LinkerOutputFile), DistributorPath(Distributor),
DistributorArgs(DistributorArgs), RemoteCompiler(RemoteCompiler),
+ RemoteCompilerPrependArgs(RemoteCompilerPrependArgs),
RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps) {}
virtual void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset,
@@ -2383,6 +2390,11 @@ public:
JOS.attributeArray("args", [&]() {
JOS.value(RemoteCompiler);
+ // Forward any supplied prepend options.
+ if (!RemoteCompilerPrependArgs.empty())
+ for (auto &A : RemoteCompilerPrependArgs)
+ JOS.value(A);
+
JOS.value("-c");
JOS.value(Saver.save("--target=" + Triple.str()));
@@ -2513,6 +2525,7 @@ ThinBackend lto::createOutOfProcessThinBackend(
bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles,
StringRef LinkerOutputFile, StringRef Distributor,
ArrayRef<StringRef> DistributorArgs, StringRef RemoteCompiler,
+ ArrayRef<StringRef> RemoteCompilerPrependArgs,
ArrayRef<StringRef> RemoteCompilerArgs, bool SaveTemps) {
auto Func =
[=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
@@ -2522,7 +2535,7 @@ ThinBackend lto::createOutOfProcessThinBackend(
Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
AddStream, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles,
LinkerOutputFile, Distributor, DistributorArgs, RemoteCompiler,
- RemoteCompilerArgs, SaveTemps);
+ RemoteCompilerPrependArgs, RemoteCompilerArgs, SaveTemps);
};
return ThinBackend(Func, Parallelism);
}
diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp
index 98090d3..6670971 100644
--- a/llvm/lib/MC/MCAsmInfoELF.cpp
+++ b/llvm/lib/MC/MCAsmInfoELF.cpp
@@ -197,6 +197,8 @@ void MCAsmInfoELF::printSwitchToSection(const MCSection &Section,
OS << "llvm_jt_sizes";
else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
OS << "llvm_cfi_jump_table";
+ else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH)
+ OS << "llvm_call_graph";
else
OS << "0x" << Twine::utohexstr(Sec.Type);
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index aee3c3b..b2f5000 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -554,7 +554,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC);
CallGraphSection =
- Ctx->getELFSection(".llvm.callgraph", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".llvm.callgraph", ELF::SHT_LLVM_CALL_GRAPH, 0);
StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0);
@@ -1172,7 +1172,7 @@ MCObjectFileInfo::getCallGraphSection(const MCSection &TextSec) const {
}
return Ctx->getELFSection(
- ".llvm.callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName,
+ ".llvm.callgraph", ELF::SHT_LLVM_CALL_GRAPH, Flags, 0, GroupName,
/*IsComdat=*/true, ElfSec.getUniqueID(),
static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
}
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 6195355..1a3752f 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -637,6 +637,8 @@ EndStmt:
Type = ELF::SHT_LLVM_JT_SIZES;
else if (TypeName == "llvm_cfi_jump_table")
Type = ELF::SHT_LLVM_CFI_JUMP_TABLE;
+ else if (TypeName == "llvm_call_graph")
+ Type = ELF::SHT_LLVM_CALL_GRAPH;
else if (TypeName.getAsInteger(0, Type))
return TokError("unknown section type");
}
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index f256e7b..6da97f9 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -322,6 +322,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LTO);
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_JT_SIZES)
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CFI_JUMP_TABLE)
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CALL_GRAPH);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_SFRAME);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH);
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index c3a27c9..f8a84b0 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -744,6 +744,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
ECase(SHT_LLVM_BB_ADDR_MAP);
ECase(SHT_LLVM_OFFLOADING);
ECase(SHT_LLVM_LTO);
+ ECase(SHT_LLVM_CALL_GRAPH);
ECase(SHT_GNU_SFRAME);
ECase(SHT_GNU_ATTRIBUTES);
ECase(SHT_GNU_HASH);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 048c58d..3c9a27a 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -669,7 +669,14 @@ void PassBuilder::registerFunctionAnalyses(FunctionAnalysisManager &FAM) {
FAM.registerPass([&] { return buildDefaultAAPipeline(); });
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
- FAM.registerPass([&] { return CREATE_PASS; });
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CREATE_PASS)>, \
+ const TargetMachine &>) { \
+ if (TM) \
+ FAM.registerPass([&] { return CREATE_PASS; }); \
+ } else { \
+ FAM.registerPass([&] { return CREATE_PASS; }); \
+ }
#include "PassRegistry.def"
for (auto &C : FunctionAnalysisRegistrationCallbacks)
@@ -2038,6 +2045,14 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
}
#define FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CREATE_PASS)>, \
+ const TargetMachine &>) { \
+ if (!TM) \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
return Error::success(); \
}
@@ -2046,6 +2061,18 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
+ auto CreatePass = CREATE_PASS; \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CreatePass( \
+ Params.get()))>, \
+ const TargetMachine &, \
+ std::remove_reference_t<decltype(Params.get())>>) { \
+ if (!TM) { \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
+ } \
MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \
return Error::success(); \
}
@@ -2152,6 +2179,14 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
}
#define FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CREATE_PASS)>, \
+ const TargetMachine &>) { \
+ if (!TM) \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
CGPM.addPass(createCGSCCToFunctionPassAdaptor(CREATE_PASS)); \
return Error::success(); \
}
@@ -2160,6 +2195,18 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
+ auto CreatePass = CREATE_PASS; \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CreatePass( \
+ Params.get()))>, \
+ const TargetMachine &, \
+ std::remove_reference_t<decltype(Params.get())>>) { \
+ if (!TM) { \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
+ } \
CGPM.addPass(createCGSCCToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \
return Error::success(); \
}
@@ -2239,6 +2286,14 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
// Now expand the basic registered passes from the .inc file.
#define FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CREATE_PASS)>, \
+ const TargetMachine &>) { \
+ if (!TM) \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
FPM.addPass(CREATE_PASS); \
return Error::success(); \
}
@@ -2247,14 +2302,34 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
+ auto CreatePass = CREATE_PASS; \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CreatePass( \
+ Params.get()))>, \
+ const TargetMachine &, \
+ std::remove_reference_t<decltype(Params.get())>>) { \
+ if (!TM) { \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
+ } \
FPM.addPass(CREATE_PASS(Params.get())); \
return Error::success(); \
}
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
+ if constexpr (std::is_constructible_v< \
+ std::remove_reference_t<decltype(CREATE_PASS)>, \
+ const TargetMachine &>) { \
+ if (!TM) \
+ return make_error<StringError>( \
+ formatv("pass '{0}' requires TargetMachine", Name).str(), \
+ inconvertibleErrorCode()); \
+ } \
FPM.addPass( \
- RequireAnalysisPass< \
- std::remove_reference_t<decltype(CREATE_PASS)>, Function>()); \
+ RequireAnalysisPass<std::remove_reference_t<decltype(CREATE_PASS)>, \
+ Function>()); \
return Error::success(); \
} \
if (Name == "invalidate<" NAME ">") { \
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index a66b6e4..1853cdd 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -345,7 +345,7 @@ FUNCTION_ANALYSIS("aa", AAManager())
FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis())
FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
FUNCTION_ANALYSIS("bb-sections-profile-reader",
- BasicBlockSectionsProfileReaderAnalysis(TM))
+ BasicBlockSectionsProfileReaderAnalysis(*TM))
FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis())
FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis())
FUNCTION_ANALYSIS("cycles", CycleAnalysis())
@@ -356,7 +356,7 @@ FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis())
FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis())
FUNCTION_ANALYSIS("ephemerals", EphemeralValuesAnalysis())
FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis())
-FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(TM))
+FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(*TM))
FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis())
FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis())
FUNCTION_ANALYSIS("last-run-tracking", LastRunTrackingAnalysis())
@@ -406,14 +406,14 @@ FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
FUNCTION_PASS("annotation-remarks", AnnotationRemarksPass())
FUNCTION_PASS("assume-builder", AssumeBuilderPass())
FUNCTION_PASS("assume-simplify", AssumeSimplifyPass())
-FUNCTION_PASS("atomic-expand", AtomicExpandPass(TM))
+FUNCTION_PASS("atomic-expand", AtomicExpandPass(*TM))
FUNCTION_PASS("bdce", BDCEPass())
FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
FUNCTION_PASS("callbr-prepare", CallBrPreparePass())
FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
FUNCTION_PASS("chr", ControlHeightReductionPass())
-FUNCTION_PASS("codegenprepare", CodeGenPreparePass(TM))
-FUNCTION_PASS("complex-deinterleaving", ComplexDeinterleavingPass(TM))
+FUNCTION_PASS("codegenprepare", CodeGenPreparePass(*TM))
+FUNCTION_PASS("complex-deinterleaving", ComplexDeinterleavingPass(*TM))
FUNCTION_PASS("consthoist", ConstantHoistingPass())
FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass())
FUNCTION_PASS("coro-elide", CoroElidePass())
@@ -430,10 +430,10 @@ FUNCTION_PASS("dot-dom-only", DomOnlyPrinter())
FUNCTION_PASS("dot-post-dom", PostDomPrinter())
FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter())
FUNCTION_PASS("dse", DSEPass())
-FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM))
+FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(*TM))
FUNCTION_PASS("drop-unnecessary-assumes", DropUnnecessaryAssumesPass())
-FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM))
-FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
+FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(*TM))
+FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(*TM))
FUNCTION_PASS("expand-reductions", ExpandReductionsPass())
FUNCTION_PASS("extra-vector-passes",
ExtraFunctionPassManager<ShouldRunExtraVectorPasses>())
@@ -446,15 +446,15 @@ FUNCTION_PASS("guard-widening", GuardWideningPass())
FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("gvn-sink", GVNSinkPass())
FUNCTION_PASS("helloworld", HelloWorldPass())
-FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(TM))
+FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM))
FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
FUNCTION_PASS("infer-alignment", InferAlignmentPass())
FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
FUNCTION_PASS("instcount", InstCountPass())
FUNCTION_PASS("instnamer", InstructionNamerPass())
FUNCTION_PASS("instsimplify", InstSimplifyPass())
-FUNCTION_PASS("interleaved-access", InterleavedAccessPass(TM))
-FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM))
+FUNCTION_PASS("interleaved-access", InterleavedAccessPass(*TM))
+FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(*TM))
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
FUNCTION_PASS("irce", IRCEPass())
FUNCTION_PASS("jump-threading", JumpThreadingPass())
@@ -533,25 +533,25 @@ FUNCTION_PASS("reassociate", ReassociatePass())
FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass())
FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib())
FUNCTION_PASS("reg2mem", RegToMemPass())
-FUNCTION_PASS("safe-stack", SafeStackPass(TM))
+FUNCTION_PASS("safe-stack", SafeStackPass(*TM))
FUNCTION_PASS("sandbox-vectorizer", SandboxVectorizerPass())
FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass())
FUNCTION_PASS("sccp", SCCPPass())
-FUNCTION_PASS("select-optimize", SelectOptimizePass(TM))
+FUNCTION_PASS("select-optimize", SelectOptimizePass(*TM))
FUNCTION_PASS("separate-const-offset-from-gep",
SeparateConstOffsetFromGEPPass())
FUNCTION_PASS("sink", SinkingPass())
FUNCTION_PASS("sjlj-eh-prepare", SjLjEHPreparePass(TM))
FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
FUNCTION_PASS("slsr", StraightLineStrengthReducePass())
-FUNCTION_PASS("stack-protector", StackProtectorPass(TM))
+FUNCTION_PASS("stack-protector", StackProtectorPass(*TM))
FUNCTION_PASS("strip-gc-relocates", StripGCRelocates())
FUNCTION_PASS("tailcallelim", TailCallElimPass())
FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass())
FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
FUNCTION_PASS("tsan", ThreadSanitizerPass())
-FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
+FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
@@ -730,7 +730,7 @@ FUNCTION_PASS_WITH_PARAMS(
FUNCTION_PASS_WITH_PARAMS(
"expand-fp", "ExpandFpPass",
[TM = TM](CodeGenOptLevel OL) {
- return ExpandFpPass(TM, OL);
+ return ExpandFpPass(*TM, OL);
},
parseExpandFpOptions, "O0;O1;O2;O3")
diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 0ecf47d..2715229 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -132,24 +132,70 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
return std::move(SubPatterns);
}
+static StringRef maxPlainSubstring(StringRef S) {
+ StringRef Best;
+ while (!S.empty()) {
+ size_t PrefixSize = S.find_first_of("?*[{\\");
+ if (PrefixSize == std::string::npos)
+ PrefixSize = S.size();
+
+ if (Best.size() < PrefixSize)
+ Best = S.take_front(PrefixSize);
+
+ S = S.drop_front(PrefixSize);
+
+ // S cannot be empty here: the first and last characters of the input
+ // string must be glob special characters; otherwise they would have been
+ // consumed into the prefix or the suffix.
+ assert(!S.empty());
+
+ switch (S.front()) {
+ case '\\':
+ S = S.drop_front(2);
+ break;
+ case '[': {
+ // Drop '[' and the first character which can be ']'.
+ S = S.drop_front(2);
+ size_t EndBracket = S.find_first_of("]");
+ // This should be impossible: SubGlobPattern::create fails on an invalid
+ // pattern before we get here.
+ assert(EndBracket != std::string::npos);
+ S = S.drop_front(EndBracket + 1);
+ break;
+ }
+ case '{':
+ // TODO: implement.
+ // Fall back to the best substring found so far.
+ return Best;
+ default:
+ S = S.drop_front(1);
+ }
+ }
+
+ return Best;
+}
+
Expected<GlobPattern>
GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
GlobPattern Pat;
+ Pat.Pattern = S;
// Store the prefix that does not contain any metacharacter.
- size_t PrefixSize = S.find_first_of("?*[{\\");
- Pat.Prefix = S.substr(0, PrefixSize);
- if (PrefixSize == std::string::npos)
+ Pat.PrefixSize = S.find_first_of("?*[{\\");
+ if (Pat.PrefixSize == std::string::npos) {
+ Pat.PrefixSize = S.size();
return Pat;
- S = S.substr(PrefixSize);
+ }
+ S = S.substr(Pat.PrefixSize);
// Just in case we stop on unmatched opening brackets.
size_t SuffixStart = S.find_last_of("?*[]{}\\");
assert(SuffixStart != std::string::npos);
if (S[SuffixStart] == '\\')
++SuffixStart;
- ++SuffixStart;
- Pat.Suffix = S.substr(SuffixStart);
+ if (SuffixStart < S.size())
+ ++SuffixStart;
+ Pat.SuffixSize = S.size() - SuffixStart;
S = S.substr(0, SuffixStart);
SmallVector<std::string, 1> SubPats;
@@ -199,10 +245,15 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
return Pat;
}
+StringRef GlobPattern::longest_substr() const {
+ return maxPlainSubstring(
+ Pattern.drop_front(PrefixSize).drop_back(SuffixSize));
+}
+
bool GlobPattern::match(StringRef S) const {
- if (!S.consume_front(Prefix))
+ if (!S.consume_front(prefix()))
return false;
- if (!S.consume_back(Suffix))
+ if (!S.consume_back(suffix()))
return false;
if (SubGlobs.empty() && S.empty())
return true;
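
A usage sketch of the new longest_substr() accessor (assuming the matching header change that stores Pattern, PrefixSize, and SuffixSize): the prefix and suffix are the metacharacter-free ends of the pattern, and longest_substr() scans only the middle for the longest plain run.

#include "llvm/Support/GlobPattern.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Prefix "foo", suffix "qux"; the middle "*bar[0-9]baz*" contains two
  // plain runs, "bar" and "baz", and the first maximal one wins.
  Expected<GlobPattern> Pat = GlobPattern::create("foo*bar[0-9]baz*qux");
  if (!Pat) {
    consumeError(Pat.takeError());
    return 1;
  }
  outs() << Pat->longest_substr() << "\n"; // prints "bar"
  return 0;
}
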
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f788c75..92f260f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4005,24 +4005,20 @@ def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
// load zero-extended i32, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-
+def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
// load zero-extended i16, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-
+def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
// load zero-extended i8, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-
+def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
// load zero-extended i16, bitcast to f32
-def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-
+def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
// load zero-extended i8, bitcast to f32
-def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e3370d3..2053fc4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1577,18 +1577,26 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
}
static bool isAllActivePredicate(Value *Pred) {
- // Look through convert.from.svbool(convert.to.svbool(...) chain.
Value *UncastedPred;
+
+ // Look through predicate casts that only remove lanes.
if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
- m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
- m_Value(UncastedPred)))))
- // If the predicate has the same or less lanes than the uncasted
- // predicate then we know the casting has no effect.
- if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
- cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
- Pred = UncastedPred;
+ m_Value(UncastedPred)))) {
+ auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
+ Pred = UncastedPred;
+
+ if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
+ m_Value(UncastedPred))))
+ // If the predicate has the same number of lanes as the uncasted
+ // predicate, or fewer, then we know the casting has no effect.
+ if (OrigPredTy->getMinNumElements() <=
+ cast<ScalableVectorType>(UncastedPred->getType())
+ ->getMinNumElements())
+ Pred = UncastedPred;
+ }
+
auto *C = dyn_cast<Constant>(Pred);
- return (C && C->isAllOnesValue());
+ return C && C->isAllOnesValue();
}
// Simplify `V` by only considering the operations that affect active lanes.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea32748..1c8383c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
"true", "Has v_mad_u32 instruction">;
+def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts",
+ "HasAddMinMaxInsts",
+ "true",
+ "Has v_add_{min|max}_{i|u}32 instructions"
+>;
+
+def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
+ "HasPkAddMinMaxInsts",
+ "true",
+ "Has v_pk_add_{min|max}_{i|u}16 instructions"
+>;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
FeatureMadU32Inst,
+ FeatureAddMinMaxInsts,
+ FeaturePkAddMinMaxInsts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
Feature45BitNumRecordsBufferResource,
@@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts :
def HasAddMinMaxInsts :
Predicate<"Subtarget->hasAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>;
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>;
def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a4..54ba2f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_perm_pk16_b4_u4:
case Intrinsic::amdgcn_perm_pk16_b6_u4:
case Intrinsic::amdgcn_perm_pk16_b8_u4:
+ case Intrinsic::amdgcn_add_max_i32:
+ case Intrinsic::amdgcn_add_max_u32:
+ case Intrinsic::amdgcn_add_min_i32:
+ case Intrinsic::amdgcn_add_min_u32:
+ case Intrinsic::amdgcn_pk_add_max_i16:
+ case Intrinsic::amdgcn_pk_add_max_u16:
+ case Intrinsic::amdgcn_pk_add_min_i16:
+ case Intrinsic::amdgcn_pk_add_min_u16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 996b55f..02c5390 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2086,7 +2086,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
- addPass(AtomicExpandPass(&TM));
+ addPass(AtomicExpandPass(TM));
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(AMDGPUPromoteAllocaPass(TM));
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a466780..ac660d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -277,6 +277,8 @@ protected:
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
bool HasMadU32Inst = false;
+ bool HasAddMinMaxInsts = false;
+ bool HasPkAddMinMaxInsts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -1567,10 +1569,10 @@ public:
bool hasIntMinMax64() const { return GFX1250Insts; }
// \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
- bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
- bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7cce033..ee10190 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
- defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
}
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fce..c4692b7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
!if (P.HasModifiers,
- getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
+ getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
@@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
- let HasModifiers = 0;
+ let HasNeg = 0;
+ let EnableClamp = 1;
}
let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 3368a50..36b9908 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1471,6 +1471,435 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// instructions) auto-generated.
#include "ARMGenMCPseudoLowering.inc"
+// Helper function to check if a register is live (used as an implicit operand)
+// in the given call instruction.
+static bool isRegisterLiveInCall(const MachineInstr &Call, MCRegister Reg) {
+ for (const MachineOperand &MO : Call.implicit_operands()) {
+ if (MO.isReg() && MO.getReg() == Reg && MO.isUse()) {
+ return true;
+ }
+ }
+ return false;
+}
+
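+// Emit the ARM-mode KCFI check. Illustrative shape of the emitted sequence,
+// assuming r12 as the scratch register and no r3 spill:
+//   bic  r12, rTarget, #1
+//   ldr  r12, [r12, #-(PrefixNops * 4 + 4)]
+//   eor  r12, r12, #(type & 0xFF)
+//   eor  r12, r12, #(type & 0xFF00)
+//   eor  r12, r12, #(type & 0xFF0000)
+//   eors r12, r12, #(type & 0xFF000000)
+//   beq  .Lpass
+//   udf  #ESR
+// .Lpass: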
+void ARMAsmPrinter::EmitKCFI_CHECK_ARM32(Register AddrReg, int64_t Type,
+ const MachineInstr &Call,
+ int64_t PrefixNops) {
+ // Choose scratch register: r12 primary, r3 if target is r12.
+ unsigned ScratchReg = ARM::R12;
+ if (AddrReg == ARM::R12) {
+ ScratchReg = ARM::R3;
+ }
+
+ // Calculate the ESR for ARM mode (16-bit): 0x8000 | (scratch_reg << 5) | addr_reg.
+ // The scratch_reg field is always 0x1F because the EOR sequence clobbers
+ // the scratch register.
+ const ARMBaseRegisterInfo *TRI = static_cast<const ARMBaseRegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned AddrIndex = TRI->getEncodingValue(AddrReg);
+ unsigned ESR = 0x8000 | (31 << 5) | (AddrIndex & 31);
+
+ // Check if r3 is live and needs to be spilled.
+ bool NeedSpillR3 =
+ (ScratchReg == ARM::R3) && isRegisterLiveInCall(Call, ARM::R3);
+
+ // If we need to spill r3, push it first.
+ if (NeedSpillR3) {
+ // push {r3}
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::STMDB_UPD)
+ .addReg(ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(ARM::R3));
+ }
+
+ // Clear bit 0 of target address to handle Thumb function pointers.
+ // In 32-bit ARM, function pointers may have the low bit set to indicate
+ // Thumb state when ARM/Thumb interworking is enabled (ARMv4T and later).
+ // We need to clear it to avoid an alignment fault when loading.
+ // bic scratch, target, #1
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BICri)
+ .addReg(ScratchReg)
+ .addReg(AddrReg)
+ .addImm(1)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(0));
+
+ // ldr scratch, [scratch, #-(PrefixNops * 4 + 4)]
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(-(PrefixNops * 4 + 4))
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // Each EOR instruction XORs one byte of the type, shifted to its position.
+ for (int i = 0; i < 4; i++) {
+ uint8_t byte = (Type >> (i * 8)) & 0xFF;
+ uint32_t imm = static_cast<uint32_t>(byte) << (i * 8); // avoid signed-shift overflow
+ bool isLast = (i == 3);
+
+ // Encode as ARM modified immediate.
+ int SOImmVal = ARM_AM::getSOImmVal(imm);
+ assert(SOImmVal != -1 &&
+ "Cannot encode immediate as ARM modified immediate");
+
+ // eor[s] scratch, scratch, #imm (last one sets flags with CPSR)
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(ARM::EORri)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(SOImmVal)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(isLast ? ARM::CPSR : ARM::NoRegister));
+ }
+
+ // If we spilled r3, restore it immediately after the comparison.
+ // This must happen before the branch so r3 is valid on both paths.
+ if (NeedSpillR3) {
+ // pop {r3}
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDMIA_UPD)
+ .addReg(ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(ARM::R3));
+ }
+
+ // beq .Lpass (branch if types match, i.e., scratch is zero)
+ MCSymbol *Pass = OutContext.createTempSymbol();
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(ARM::Bcc)
+ .addExpr(MCSymbolRefExpr::create(Pass, OutContext))
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR));
+
+ // udf #ESR (trap with encoded diagnostic)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::UDF).addImm(ESR));
+
+ OutStreamer->emitLabel(Pass);
+}
+
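+// Emit the Thumb2 KCFI check. Same shape as the ARM-mode sequence above, but
+// with Thumb2 encodings (t2BIC/t2LDR/t2EOR/beq.w) and an 8-bit ESR.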
+void ARMAsmPrinter::EmitKCFI_CHECK_Thumb2(Register AddrReg, int64_t Type,
+ const MachineInstr &Call,
+ int64_t PrefixNops) {
+ // Choose scratch register: r12 primary, r3 if target is r12.
+ unsigned ScratchReg = ARM::R12;
+ if (AddrReg == ARM::R12) {
+ ScratchReg = ARM::R3;
+ }
+
+ // Calculate ESR for Thumb mode (8-bit): 0x80 | addr_reg
+ // Bit 7: KCFI trap indicator
+ // Bits 6-5: Reserved
+ // Bits 4-0: Address register encoding
+ const ARMBaseRegisterInfo *TRI = static_cast<const ARMBaseRegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned AddrIndex = TRI->getEncodingValue(AddrReg);
+ unsigned ESR = 0x80 | (AddrIndex & 0x1F);
+
+ // Check if r3 is live and needs to be spilled.
+ bool NeedSpillR3 =
+ (ScratchReg == ARM::R3) && isRegisterLiveInCall(Call, ARM::R3);
+
+ // If we need to spill r3, push it first.
+ if (NeedSpillR3) {
+ // push {r3}
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3));
+ }
+
+ // Clear bit 0 of target address to handle Thumb function pointers.
+ // In 32-bit ARM, function pointers may have the low bit set to indicate
+ // Thumb state when ARM/Thumb interworking is enabled (ARMv4T and later).
+ // We need to clear it to avoid an alignment fault when loading.
+ // bic scratch, target, #1
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2BICri)
+ .addReg(ScratchReg)
+ .addReg(AddrReg)
+ .addImm(1)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(0));
+
+ // ldr scratch, [scratch, #-(PrefixNops * 4 + 4)]
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi8)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(-(PrefixNops * 4 + 4))
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // Each EOR instruction XORs one byte of the type, shifted to its position.
+ for (int i = 0; i < 4; i++) {
+ uint8_t byte = (Type >> (i * 8)) & 0xFF;
+ uint32_t imm = static_cast<uint32_t>(byte) << (i * 8); // avoid signed-shift overflow
+ bool isLast = (i == 3);
+
+ // Verify the immediate can be encoded as Thumb2 modified immediate.
+ assert(ARM_AM::getT2SOImmVal(imm) != -1 &&
+ "Cannot encode immediate as Thumb2 modified immediate");
+
+ // eor[s] scratch, scratch, #imm (last one sets flags with CPSR)
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(ARM::t2EORri)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(imm)
+ .addImm(ARMCC::AL)
+ .addReg(0)
+ .addReg(isLast ? ARM::CPSR : ARM::NoRegister));
+ }
+
+ // If we spilled r3, restore it immediately after the comparison.
+ // This must happen before the branch so r3 is valid on both paths.
+ if (NeedSpillR3) {
+ // pop {r3}
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3));
+ }
+
+ // beq .Lpass (branch if types match, i.e., scratch is zero)
+ MCSymbol *Pass = OutContext.createTempSymbol();
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(ARM::t2Bcc)
+ .addExpr(MCSymbolRefExpr::create(Pass, OutContext))
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR));
+
+ // udf #ESR (trap with encoded diagnostic)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tUDF).addImm(ESR));
+
+ OutStreamer->emitLabel(Pass);
+}
+
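+// Emit the Thumb1 KCFI check. Thumb1 has no flexible modified immediates, so
+// the expected type is rebuilt byte-by-byte in a temporary register.
+// Illustrative shape, ignoring the optional r2/r3 spills:
+//   movs r3, #1
+//   mov  r2, rTarget
+//   bics r2, r3
+//   subs r2, #(PrefixNops * 4 + 4)
+//   ldr  r2, [r2]
+//   movs r3, #byte3        ; then 3x { lsls r3, r3, #8 ; adds r3, #byteN }
+//   cmp  r2, r3
+//   beq  .Lpass
+//   bkpt #0
+// .Lpass: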
+void ARMAsmPrinter::EmitKCFI_CHECK_Thumb1(Register AddrReg, int64_t Type,
+ const MachineInstr &Call,
+ int64_t PrefixNops) {
+ // For Thumb1, use R2 unconditionally as scratch register (a low register
+ // required for tLDRi). R3 is used for building the type hash.
+ unsigned ScratchReg = ARM::R2;
+ unsigned TempReg = ARM::R3;
+
+ // Check if r3 is live and needs to be spilled.
+ bool NeedSpillR3 = isRegisterLiveInCall(Call, ARM::R3);
+
+ // Spill r3 if needed
+ if (NeedSpillR3) {
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3));
+ }
+
+ // Check if r2 is live and needs to be spilled.
+ bool NeedSpillR2 = isRegisterLiveInCall(Call, ARM::R2);
+
+ // Push R2 if it's live
+ if (NeedSpillR2) {
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPUSH).addImm(ARMCC::AL).addReg(0).addReg(ARM::R2));
+ }
+
+ // Clear bit 0 of the target address. TempReg (R3) is used first as a
+ // helper for the BIC, then later for building the type hash.
+
+ // movs temp, #1
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addImm(1)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // mov scratch, target
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+ .addReg(ScratchReg)
+ .addReg(AddrReg)
+ .addImm(ARMCC::AL));
+
+ // bics scratch, temp (scratch = scratch & ~temp)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBIC)
+ .addReg(ScratchReg)
+ .addReg(ARM::CPSR)
+ .addReg(ScratchReg)
+ .addReg(TempReg)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // Load the type hash. Thumb1 loads don't support negative offsets, so
+ // subtract the offset from the address first.
+ int offset = PrefixNops * 4 + 4;
+
+ // subs scratch, #offset
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tSUBi8)
+ .addReg(ScratchReg)
+ .addReg(ARM::CPSR)
+ .addReg(ScratchReg)
+ .addImm(offset)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // ldr scratch, [scratch, #0]
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(0)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // Load the expected type inline (instead of the EOR sequence used by the
+ // ARM and Thumb2 variants).
+ //
+ // This creates the 32-bit value byte-by-byte in the temp register:
+ // movs temp, #byte3 (high byte)
+ // lsls temp, temp, #8
+ // adds temp, #byte2
+ // lsls temp, temp, #8
+ // adds temp, #byte1
+ // lsls temp, temp, #8
+ // adds temp, #byte0 (low byte)
+
+ uint8_t byte0 = (Type >> 0) & 0xFF;
+ uint8_t byte1 = (Type >> 8) & 0xFF;
+ uint8_t byte2 = (Type >> 16) & 0xFF;
+ uint8_t byte3 = (Type >> 24) & 0xFF;
+
+ // movs temp, #byte3 (start with high byte)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addImm(byte3)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // lsls temp, temp, #8
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(8)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // adds temp, #byte2
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(byte2)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // lsls temp, temp, #8
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(8)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // adds temp, #byte1
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(byte1)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // lsls temp, temp, #8
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(8)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // adds temp, #byte0 (low byte)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi8)
+ .addReg(TempReg)
+ .addReg(ARM::CPSR)
+ .addReg(TempReg)
+ .addImm(byte0)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // cmp scratch, temp
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tCMPr)
+ .addReg(ScratchReg)
+ .addReg(TempReg)
+ .addImm(ARMCC::AL)
+ .addReg(0));
+
+ // Restore registers if spilled (pop in reverse order of push: R2, then R3)
+ if (NeedSpillR2) {
+ // pop {r2}
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R2));
+ }
+
+ // Restore r3 if spilled
+ if (NeedSpillR3) {
+ // pop {r3}
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(ARM::tPOP).addImm(ARMCC::AL).addReg(0).addReg(ARM::R3));
+ }
+
+ // beq .Lpass (branch if types match, i.e., scratch == temp)
+ MCSymbol *Pass = OutContext.createTempSymbol();
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(ARM::tBcc)
+ .addExpr(MCSymbolRefExpr::create(Pass, OutContext))
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR));
+
+ // bkpt #0 (trap on type mismatch; no ESR diagnostic is encoded)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBKPT).addImm(0));
+
+ OutStreamer->emitLabel(Pass);
+}
+
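+// Lower a KCFI_CHECK pseudo into the mode-specific check sequence. The KCFI
+// pass emits the pseudo immediately before the indirect call it guards, so
+// the following instruction is expected to be the call itself.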
+void ARMAsmPrinter::LowerKCFI_CHECK(const MachineInstr &MI) {
+ Register AddrReg = MI.getOperand(0).getReg();
+ const int64_t Type = MI.getOperand(1).getImm();
+
+ // Get the call instruction that follows this KCFI_CHECK.
+ assert(std::next(MI.getIterator())->isCall() &&
+ "KCFI_CHECK not followed by a call instruction");
+ const MachineInstr &Call = *std::next(MI.getIterator());
+
+ // Adjust the offset for patchable-function-prefix.
+ int64_t PrefixNops = 0;
+ MI.getMF()
+ ->getFunction()
+ .getFnAttribute("patchable-function-prefix")
+ .getValueAsString()
+ .getAsInteger(10, PrefixNops);
+
+ // Emit the appropriate instruction sequence based on the opcode variant.
+ switch (MI.getOpcode()) {
+ case ARM::KCFI_CHECK_ARM:
+ EmitKCFI_CHECK_ARM32(AddrReg, Type, Call, PrefixNops);
+ break;
+ case ARM::KCFI_CHECK_Thumb2:
+ EmitKCFI_CHECK_Thumb2(AddrReg, Type, Call, PrefixNops);
+ break;
+ case ARM::KCFI_CHECK_Thumb1:
+ EmitKCFI_CHECK_Thumb1(AddrReg, Type, Call, PrefixNops);
+ break;
+ default:
+ llvm_unreachable("Unexpected KCFI_CHECK opcode");
+ }
+}
+
void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
ARM_MC::verifyInstructionPredicates(MI->getOpcode(),
getSubtargetInfo().getFeatureBits());
@@ -1504,6 +1933,11 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
switch (Opc) {
case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
+ case ARM::KCFI_CHECK_ARM:
+ case ARM::KCFI_CHECK_Thumb2:
+ case ARM::KCFI_CHECK_Thumb1:
+ LowerKCFI_CHECK(*MI);
+ return;
case ARM::LEApcrel:
case ARM::tLEApcrel:
case ARM::t2LEApcrel: {
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 2b067c7..9e92b5a 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -123,9 +123,20 @@ public:
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+ // KCFI check lowering
+ void LowerKCFI_CHECK(const MachineInstr &MI);
+
private:
void EmitSled(const MachineInstr &MI, SledKind Kind);
+ // KCFI check emission helpers
+ void EmitKCFI_CHECK_ARM32(Register AddrReg, int64_t Type,
+ const MachineInstr &Call, int64_t PrefixNops);
+ void EmitKCFI_CHECK_Thumb2(Register AddrReg, int64_t Type,
+ const MachineInstr &Call, int64_t PrefixNops);
+ void EmitKCFI_CHECK_Thumb1(Register AddrReg, int64_t Type,
+ const MachineInstr &Call, int64_t PrefixNops);
+
// Helpers for emitStartOfAsmFile() and emitEndOfAsmFile()
void emitAttributes();
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 0d7b6d1..fffb6373 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2301,6 +2301,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
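+ // Preserve the KCFI type id so the expanded tail call still gets a
+ // KCFI check.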
+ NewMI->setCFIType(*MBB.getParent(), MI.getCFIType());
+
// Update call info and delete the pseudo instruction TCRETURN.
if (MI.isCandidateForAdditionalCallInfo())
MI.getMF()->moveAdditionalCallInfo(&MI, &*NewMI);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b1a668e..8122db2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2849,6 +2849,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (isTailCall) {
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
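+ // Attach the KCFI type id to the node; it is carried through to the
+ // resulting machine instruction.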
+ if (CLI.CFIType)
+ Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
@@ -2856,6 +2858,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
+ if (CLI.CFIType)
+ Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InGlue = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -12008,6 +12012,71 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
.add(predOps(ARMCC::AL));
}
+bool ARMTargetLowering::supportKCFIBundles() const {
+ // KCFI is supported in all ARM/Thumb modes
+ return true;
+}
+
+MachineInstr *
+ARMTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &MBBI,
+ const TargetInstrInfo *TII) const {
+ assert(MBBI->isCall() && MBBI->getCFIType() &&
+ "Invalid call instruction for a KCFI check");
+
+ MachineOperand *TargetOp = nullptr;
+ switch (MBBI->getOpcode()) {
+ // ARM mode opcodes
+ case ARM::BLX:
+ case ARM::BLX_pred:
+ case ARM::BLX_noip:
+ case ARM::BLX_pred_noip:
+ case ARM::BX_CALL:
+ TargetOp = &MBBI->getOperand(0);
+ break;
+ case ARM::TCRETURNri:
+ case ARM::TCRETURNrinotr12:
+ case ARM::TAILJMPr:
+ case ARM::TAILJMPr4:
+ TargetOp = &MBBI->getOperand(0);
+ break;
+ // Thumb mode opcodes (Thumb1 and Thumb2).
+ // Note: most Thumb call instructions have predicate operands before the
+ // target register. Format: tBLXr pred, predreg, target_register, ...
+ case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
+ case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
+ case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
+ TargetOp = &MBBI->getOperand(2);
+ break;
+ // Tail call instructions don't have predicates, target is operand 0
+ case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
+ TargetOp = &MBBI->getOperand(0);
+ break;
+ default:
+ llvm_unreachable("Unexpected CFI call opcode");
+ }
+
+ assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
+ TargetOp->setIsRenamable(false);
+
+ // Select the appropriate KCFI_CHECK variant based on the instruction set
+ unsigned KCFICheckOpcode;
+ if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb2()) {
+ KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
+ } else {
+ KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
+ }
+ } else {
+ KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
+ }
+
+ return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
+ .addReg(TargetOp->getReg())
+ .addImm(MBBI->getCFIType())
+ .getInstr();
+}
+
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 70aa001..8c5e0cf 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -447,6 +447,12 @@ class VectorType;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
+ bool supportKCFIBundles() const override;
+
+ MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &MBBI,
+ const TargetInstrInfo *TII) const override;
+
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 282ff53..53be167 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -6536,6 +6536,36 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPRPair:$addr_temp_out),
def : Pat<(atomic_fence (timm), 0), (MEMBARRIER)>;
//===----------------------------------------------------------------------===//
+// KCFI check pseudo-instruction.
+//===----------------------------------------------------------------------===//
+// KCFI_CHECK pseudo-instruction for Kernel Control-Flow Integrity.
+// Expands to a sequence that verifies the function pointer's type hash.
+// Different sizes for different architectures due to different expansions.
+
+def KCFI_CHECK_ARM
+ : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
+ Sched<[]>,
+ Requires<[IsARM]> {
+ let Size = 40; // worst-case 10 instructions (push, bic, ldr, 4x eor, pop,
+ // beq, udf) when r3 must be spilled; 4 bytes each
+}
+
+def KCFI_CHECK_Thumb2
+ : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
+ Sched<[]>,
+ Requires<[IsThumb2]> {
+ let Size = 34; // worst-case 10 instructions (push, bic, ldr, 4x eor, pop,
+ // beq.w, udf): narrow push/pop/udf, the rest 4 bytes each
+}
+
+def KCFI_CHECK_Thumb1
+ : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
+ Sched<[]>,
+ Requires<[IsThumb1Only]> {
+ let Size = 50; // worst-case 19 instructions of 2 bytes each (pushes, bic
+ // helper, type building, cmp, pops, beq, bkpt); 50 leaves headroom
+}
+
+//===----------------------------------------------------------------------===//
// Instructions used for emitting unwind opcodes on Windows.
//===----------------------------------------------------------------------===//
let isPseudo = 1 in {
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 86740a9..590d4c7 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -111,6 +111,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
initializeMVELaneInterleavingPass(Registry);
initializeARMFixCortexA57AES1742098Pass(Registry);
initializeARMDAGToDAGISelLegacyPass(Registry);
+ initializeKCFIPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -487,6 +488,9 @@ void ARMPassConfig::addPreSched2() {
// proper scheduling.
addPass(createARMExpandPseudoPass());
+ // Emit KCFI checks for indirect calls.
+ addPass(createKCFIPass());
+
if (getOptLevel() != CodeGenOptLevel::None) {
// When optimising for size, always run the Thumb2SizeReduction pass before
// IfConversion. Otherwise, check whether IT blocks are restricted
@@ -517,9 +521,12 @@ void ARMPassConfig::addPreSched2() {
void ARMPassConfig::addPreEmitPass() {
addPass(createThumb2SizeReductionPass());
- // Constant island pass work on unbundled instructions.
+ // Unpack bundles for:
+ // - Thumb2: Constant island pass requires unbundled instructions
+ // - KCFI: KCFI_CHECK pseudo instructions need to be unbundled for AsmPrinter
addPass(createUnpackMachineBundles([](const MachineFunction &MF) {
- return MF.getSubtarget<ARMSubtarget>().isThumb2();
+ return MF.getSubtarget<ARMSubtarget>().isThumb2() ||
+ MF.getFunction().getParent()->getModuleFlag("kcfi");
}));
// Don't optimize barriers or block placement at -O0.
@@ -530,6 +537,7 @@ void ARMPassConfig::addPreEmitPass() {
}
void ARMPassConfig::addPreEmitPass2() {
+
// Inserts fixup instructions before unsafe AES operations. Instructions may
// be inserted at the start of blocks and at within blocks so this pass has to
// come before those below.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeaf..ca4a655 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2614,6 +2614,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
+ // Try to widen vectors to gain more optimization opportunities.
+ if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG))
+ return NewShuffle;
if ((Result =
lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 22cf3a7..598735f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4675,7 +4675,7 @@ class WMMA_INSTR<string _Intr, list<dag> _Args>
//
class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride>
- : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record,
+ : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record_name,
[!con((ins ADDR:$src),
!if(WithStride, (ins B32:$ldm), (ins)))]>,
Requires<Frag.Predicates> {
@@ -4714,7 +4714,7 @@ class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride>
//
class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space,
bit WithStride>
- : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record,
+ : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record_name,
[!con((ins ADDR:$dst),
Frag.Ins,
!if(WithStride, (ins B32:$ldm), (ins)))]>,
@@ -4778,7 +4778,7 @@ class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> {
class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
WMMA_REGINFO FragC, WMMA_REGINFO FragD,
string ALayout, string BLayout, int Satfinite, string rnd, string b1op>
- : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record,
+ : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record_name,
[FragA.Ins, FragB.Ins, FragC.Ins]>,
// Requires does not seem to have effect on Instruction w/o Patterns.
// We set it here anyways and propagate to the Pat<> we construct below.
@@ -4837,7 +4837,7 @@ defset list<WMMA_INSTR> WMMAs = {
class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
WMMA_REGINFO FragC, WMMA_REGINFO FragD,
string ALayout, string BLayout, int Satfinite, string b1op, string Kind>
- : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, Kind, FragA, FragB, FragC, FragD>.record,
+ : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, Kind, FragA, FragB, FragC, FragD>.record_name,
[FragA.Ins, FragB.Ins, FragC.Ins]>,
// Requires does not seem to have effect on Instruction w/o Patterns.
// We set it here anyways and propagate to the Pat<> we construct below.
@@ -4891,7 +4891,7 @@ class MMA_SP<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
WMMA_REGINFO FragC, WMMA_REGINFO FragD,
string Metadata, string Kind, int Satfinite>
: WMMA_INSTR<MMA_SP_NAME<Metadata, Kind, Satfinite,
- FragA, FragB, FragC, FragD>.record,
+ FragA, FragB, FragC, FragD>.record_name,
[FragA.Ins, FragB.Ins, FragC.Ins,
(ins B32:$metadata, i32imm:$selector)]>,
// Requires does not seem to have effect on Instruction w/o Patterns.
@@ -4946,7 +4946,7 @@ defset list<WMMA_INSTR> MMA_SPs = {
// ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
//
class LDMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space>
- : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record, [(ins ADDR:$src)]>,
+ : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record_name, [(ins ADDR:$src)]>,
Requires<Frag.Predicates> {
// Build PatFrag that only matches particular address space.
PatFrag IntrFrag = PatFrag<(ops node:$src), (Intr node:$src),
@@ -4981,7 +4981,7 @@ defset list<WMMA_INSTR> LDMATRIXs = {
// stmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
//
class STMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space>
- : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record, [!con((ins ADDR:$dst), Frag.Ins)]>,
+ : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record_name, [!con((ins ADDR:$dst), Frag.Ins)]>,
Requires<Frag.Predicates> {
// Build PatFrag that only matches particular address space.
dag PFOperands = !con((ops node:$dst),
@@ -5376,7 +5376,7 @@ class Tcgen05MMAInst<bit Sp, string KindStr, string ASpace,
Requires<PTXPredicates> {
Intrinsic Intrin = !cast<Intrinsic>(
- NVVM_TCGEN05_MMA<Sp, ASpace, AShift, ScaleInputD>.record
+ NVVM_TCGEN05_MMA<Sp, ASpace, AShift, ScaleInputD>.record_name
);
dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins i64imm:$scale_input_d), (ins));
@@ -5618,7 +5618,7 @@ class Tcgen05MMABlockScaleInst<bit Sp, string ASpace, string KindStr,
Requires<[hasTcgen05Instructions, PTXPredicate]> {
Intrinsic Intrin = !cast<Intrinsic>(
- NVVM_TCGEN05_MMA_BLOCKSCALE<Sp, ASpace, KindStr, ScaleVecSize>.record);
+ NVVM_TCGEN05_MMA_BLOCKSCALE<Sp, ASpace, KindStr, ScaleVecSize>.record_name);
dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins));
dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin i32:$spmetadata), (Intrin));
@@ -5702,7 +5702,7 @@ class Tcgen05MMAWSInst<bit Sp, string ASpace, string KindStr,
Requires<[hasTcgen05Instructions]> {
Intrinsic Intrin = !cast<Intrinsic>(
- NVVM_TCGEN05_MMA_WS<Sp, ASpace, HasZeroColMask>.record);
+ NVVM_TCGEN05_MMA_WS<Sp, ASpace, HasZeroColMask>.record_name);
dag ZeroColMaskIns = !if(!eq(HasZeroColMask, 1),
(ins B64:$zero_col_mask), (ins));
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
index 67b510d..f2b216b 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormatVariadic.h"
#define GET_GICOMBINER_DEPS
#include "RISCVGenPostLegalizeGICombiner.inc"
@@ -42,6 +43,56 @@ namespace {
#include "RISCVGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
+/// Match: G_STORE (G_FCONSTANT +0.0), addr
+/// Return the source vreg in MatchInfo if matched.
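+/// Illustrative MIR on RV64 (register names assumed):
+///   %c:_(s32) = G_FCONSTANT float 0.000000e+00
+///   G_STORE %c(s32), %p(p0) :: (store (s32))
+/// becomes, after the apply step, a truncating store of an integer zero:
+///   %z:_(s64) = G_CONSTANT i64 0
+///   G_STORE %z(s64), %p(p0) :: (store (s32))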
+bool matchFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const RISCVSubtarget &STI, Register &MatchInfo) {
+ if (MI.getOpcode() != TargetOpcode::G_STORE)
+ return false;
+
+ Register SrcReg = MI.getOperand(0).getReg();
+ if (!SrcReg.isVirtual())
+ return false;
+
+ MachineInstr *Def = MRI.getVRegDef(SrcReg);
+ if (!Def || Def->getOpcode() != TargetOpcode::G_FCONSTANT)
+ return false;
+
+ auto *CFP = Def->getOperand(1).getFPImm();
+ if (!CFP || !CFP->getValueAPF().isPosZero())
+ return false;
+
+ unsigned ValBits = MRI.getType(SrcReg).getSizeInBits();
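+ // The fold only pays off when the FP value would otherwise occupy an FP
+ // register (the matching extension is enabled); f64 additionally needs a
+ // 64-bit GPR to hold the integer zero.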
+ if ((ValBits == 16 && !STI.hasStdExtZfh()) ||
+ (ValBits == 32 && !STI.hasStdExtF()) ||
+ (ValBits == 64 && (!STI.hasStdExtD() || !STI.is64Bit())))
+ return false;
+
+ MatchInfo = SrcReg;
+ return true;
+}
+
+/// Apply: rewrite to G_STORE (G_CONSTANT 0 [XLEN]), addr
+void applyFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, const RISCVSubtarget &STI,
+ Register &MatchInfo) {
+ const unsigned XLen = STI.getXLen();
+
+ auto Zero = B.buildConstant(LLT::scalar(XLen), 0);
+ MI.getOperand(0).setReg(Zero.getReg(0));
+
+ MachineInstr *Def = MRI.getVRegDef(MatchInfo);
+ if (Def && MRI.use_nodbg_empty(MatchInfo))
+ Def->eraseFromParent();
+
+#ifndef NDEBUG
+ unsigned ValBits = MRI.getType(MatchInfo).getSizeInBits();
+ LLVM_DEBUG(dbgs() << formatv("[{0}] Fold FP zero store -> int zero "
+ "(XLEN={1}, ValBits={2}):\n {3}\n",
+ DEBUG_TYPE, XLen, ValBits, MI));
+#endif
+}
+
class RISCVPostLegalizerCombinerImpl : public Combiner {
protected:
const CombinerHelper Helper;
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index 995dd0c..a06b60d 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -19,11 +19,20 @@ def RISCVO0PreLegalizerCombiner: GICombiner<
"RISCVO0PreLegalizerCombinerImpl", [optnone_combines]> {
}
+// Rule: fold store (fp +0.0) -> store (int zero [XLEN])
+def fp_zero_store_matchdata : GIDefMatchData<"Register">;
+def fold_fp_zero_store : GICombineRule<
+ (defs root:$root, fp_zero_store_matchdata:$matchinfo),
+ (match (G_STORE $src, $addr):$root,
+ [{ return matchFoldFPZeroStore(*${root}, MRI, STI, ${matchinfo}); }]),
+ (apply [{ applyFoldFPZeroStore(*${root}, MRI, B, STI, ${matchinfo}); }])>;
+
// Post-legalization combines which are primarily optimizations.
// TODO: Add more combines.
def RISCVPostLegalizerCombiner
: GICombiner<"RISCVPostLegalizerCombinerImpl",
[sub_to_add, combines_for_extload, redundant_and,
identity_combines, shift_immed_chain,
- commute_constant_to_rhs, simplify_neg_minmax]> {
+ commute_constant_to_rhs, simplify_neg_minmax,
+ fold_fp_zero_store]> {
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 4104abd..4c2f7f6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -482,7 +482,7 @@ let Predicates = [HasVendorXSfvfwmaccqqq] in {
defm SF_VFWMACC_4x4x4 : VPseudoSiFiveVFWMACC;
}
-let Predicates = [HasVendorXSfvfnrclipxfqf] in {
+let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {
defm SF_VFNRCLIP_XU_F_QF : VPseudoSiFiveVFNRCLIP;
defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
index f7d1a09..b9c5b75 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
@@ -668,4 +668,38 @@ foreach vti = NoGroupBF16Vectors in {
def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
(vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
}
+
+let Predicates = [HasStdExtZvfbfa] in {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fwti.Vector (any_riscv_fpextend_vl
+ (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Mask VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
+ (fvti.Mask VMV0:$vm),
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
+
+ def : Pat<(fvti.Vector (any_riscv_fpround_vl
+ (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask VMV0:$vm), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+ (fwti.Mask VMV0:$vm),
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
+ def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fwti.RegClass:$rs1,
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ fvti.AVL, fvti.Log2SEW, TA_MA)>;
+ }
+}
} // Predicates = [HasStdExtZvfbfa]
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index d91923b..56a38bb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1499,18 +1499,25 @@ static bool generateKernelClockInst(const SPIRV::IncomingCall *Call,
Register ResultReg = Call->ReturnRegister;
- // Deduce the `Scope` operand from the builtin function name.
- SPIRV::Scope::Scope ScopeArg =
- StringSwitch<SPIRV::Scope::Scope>(Builtin->Name)
- .EndsWith("device", SPIRV::Scope::Scope::Device)
- .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup)
- .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup);
- Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR);
-
- MIRBuilder.buildInstr(SPIRV::OpReadClockKHR)
- .addDef(ResultReg)
- .addUse(GR->getSPIRVTypeID(Call->ReturnType))
- .addUse(ScopeReg);
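+ // `__spirv_ReadClockKHR` carries the scope as an explicit argument rather
+ // than encoding it in the builtin name, so forward it unchanged.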
+ if (Builtin->Name == "__spirv_ReadClockKHR") {
+ MIRBuilder.buildInstr(SPIRV::OpReadClockKHR)
+ .addDef(ResultReg)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(Call->Arguments[0]);
+ } else {
+ // Deduce the `Scope` operand from the builtin function name.
+ SPIRV::Scope::Scope ScopeArg =
+ StringSwitch<SPIRV::Scope::Scope>(Builtin->Name)
+ .EndsWith("device", SPIRV::Scope::Scope::Device)
+ .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup)
+ .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup);
+ Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR);
+
+ MIRBuilder.buildInstr(SPIRV::OpReadClockKHR)
+ .addDef(ResultReg)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(ScopeReg);
+ }
return true;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 3b8764a..c259cce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1174,6 +1174,7 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0
defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>;
//===----------------------------------------------------------------------===//
// Class defining an atomic instruction on floating-point numbers.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index f0ac26b..14097d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1336,22 +1336,25 @@ def pmax : PatFrags<(ops node:$lhs, node:$rhs), [
]>;
defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
+multiclass PMinMaxInt<Vec vec, NI baseMinInst, NI baseMaxInst> {
+ def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$rhs)),
+ (vec.vt (bitconvert V128:$lhs))),
+ V128:$rhs, V128:$lhs)),
+ (baseMinInst $lhs, $rhs)>;
+ def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$lhs)),
+ (vec.vt (bitconvert V128:$rhs))),
+ V128:$rhs, V128:$lhs)),
+ (baseMaxInst $lhs, $rhs)>;
+}
// Also match the pmin/pmax cases where the operands are int vectors (but the
// comparison is still a floating point comparison). This can happen when using
// the wasm_simd128.h intrinsics because v128_t is an integer vector.
foreach vec = [F32x4, F64x2, F16x8] in {
-defvar pmin = !cast<NI>("PMIN_"#vec);
-defvar pmax = !cast<NI>("PMAX_"#vec);
-def : Pat<(vec.int_vt (vselect
- (setolt (vec.vt (bitconvert V128:$rhs)),
- (vec.vt (bitconvert V128:$lhs))),
- V128:$rhs, V128:$lhs)),
- (pmin $lhs, $rhs)>;
-def : Pat<(vec.int_vt (vselect
- (setolt (vec.vt (bitconvert V128:$lhs)),
- (vec.vt (bitconvert V128:$rhs))),
- V128:$rhs, V128:$lhs)),
- (pmax $lhs, $rhs)>;
+ defvar pmin = !cast<NI>("PMIN_"#vec);
+ defvar pmax = !cast<NI>("PMAX_"#vec);
+ defm : PMinMaxInt<vec, pmin, pmax>;
}
// And match the pmin/pmax LLVM intrinsics as well
@@ -1756,6 +1759,15 @@ let Predicates = [HasRelaxedSIMD] in {
(relaxed_max V128:$lhs, V128:$rhs)>;
def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
(relaxed_max V128:$lhs, V128:$rhs)>;
+
+ // Lower patterns that would otherwise select pmin/pmax to relaxed min/max.
+ let AddedComplexity = 1 in {
+ def : Pat<(vec.vt (pmin (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min $lhs, $rhs)>;
+ def : Pat<(vec.vt (pmax (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max $lhs, $rhs)>;
+ defm : PMinMaxInt<vec, relaxed_min, relaxed_max>;
+ }
}
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a..4dfc400 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2632,6 +2632,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(Op, MVT::f32, Promote);
}
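+ // Perform f16/f32/f64 atomic loads as integer atomic loads of the same
+ // width and bitcast the result back to the FP type.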
+ setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
+ setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
+ setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
ISD::SCALAR_TO_VECTOR,
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 62a3c88..975a271 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -433,6 +433,8 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["fp8e5m3-insts"] = true;
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
+ Features["add-min-max-insts"] = true;
+ Features["pk-add-min-max-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index a0f7ec6..2dd0fde 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -948,17 +948,17 @@ void llvm::updateVCallVisibilityInIndex(
// linker, as we have no information on their eventual use.
if (DynamicExportSymbols.count(P.first))
continue;
+ // With validation enabled, we want to exclude symbols visible to regular
+ // objects. Local symbols will be in this group due to the current
+ // implementation but those with VCallVisibilityTranslationUnit will have
+ // already been marked in clang so are unaffected.
+ if (VisibleToRegularObjSymbols.count(P.first))
+ continue;
for (auto &S : P.second.getSummaryList()) {
auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
if (!GVar ||
GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
continue;
- // With validation enabled, we want to exclude symbols visible to regular
- // objects. Local symbols will be in this group due to the current
- // implementation but those with VCallVisibilityTranslationUnit will have
- // already been marked in clang so are unaffected.
- if (VisibleToRegularObjSymbols.count(P.first))
- continue;
GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
}
}
@@ -1161,14 +1161,10 @@ bool DevirtIndex::tryFindVirtualCallTargets(
// and therefore the same GUID. This can happen if there isn't enough
// distinguishing path when compiling the source file. In that case we
// conservatively return false early.
+ if (P.VTableVI.hasLocal() && P.VTableVI.getSummaryList().size() > 1)
+ return false;
const GlobalVarSummary *VS = nullptr;
- bool LocalFound = false;
for (const auto &S : P.VTableVI.getSummaryList()) {
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (LocalFound)
- return false;
- LocalFound = true;
- }
auto *CurVS = cast<GlobalVarSummary>(S->getBaseObject());
if (!CurVS->vTableFuncs().empty() ||
// Previously clang did not attach the necessary type metadata to
@@ -1184,6 +1180,7 @@ bool DevirtIndex::tryFindVirtualCallTargets(
// with public LTO visibility.
if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
return false;
+ break;
}
}
// There will be no VS if all copies are available_externally having no
@@ -1411,9 +1408,8 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
// If the summary list contains multiple summaries where at least one is
// a local, give up, as we won't know which (possibly promoted) name to use.
- for (const auto &S : TheFn.getSummaryList())
- if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
- return false;
+ if (TheFn.hasLocal() && Size > 1)
+ return false;
// Collect functions devirtualized at least for one call site for stats.
if (PrintSummaryDevirt || AreStatisticsEnabled())
@@ -2591,6 +2587,11 @@ void DevirtIndex::run() {
if (ExportSummary.typeIdCompatibleVtableMap().empty())
return;
+ // Assert that we haven't made any changes that would affect the hasLocal()
+ // flag on the GUID summary info.
+ assert(!ExportSummary.withInternalizeAndPromote() &&
+ "Expect index-based WPD to run before internalization and promotion");
+
DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
for (const auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
NameByGUID[GlobalValue::getGUIDAssumingExternalLinkage(P.first)].push_back(
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 73ec451..9bee523 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2760,21 +2760,34 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
// Optimize pointer differences into the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
Value *LHSOp, *RHSOp;
- if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
- match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (match(Op0, m_PtrToIntOrAddr(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToIntOrAddr(m_Value(RHSOp))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
I.hasNoUnsignedWrap()))
return replaceInstUsesWith(I, Res);
// trunc(p)-trunc(q) -> trunc(p-q)
- if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
- match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (match(Op0, m_Trunc(m_PtrToIntOrAddr(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToIntOrAddr(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
/* IsNUW */ false))
return replaceInstUsesWith(I, Res);
- if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
- match(Op1, m_ZExtOrSelf(m_PtrToInt(m_Value(RHSOp))))) {
+ auto MatchSubOfZExtOfPtrToIntOrAddr = [&]() {
+ if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
+ match(Op1, m_ZExt(m_PtrToIntSameSize(DL, m_Value(RHSOp)))))
+ return true;
+ if (match(Op0, m_ZExt(m_PtrToAddr(m_Value(LHSOp)))) &&
+ match(Op1, m_ZExt(m_PtrToAddr(m_Value(RHSOp)))))
+ return true;
+ // Special case for non-canonical ptrtoint in constant expression,
+ // where the zext has been folded into the ptrtoint.
+ if (match(Op0, m_ZExt(m_PtrToIntSameSize(DL, m_Value(LHSOp)))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ return true;
+ return false;
+ };
+ if (MatchSubOfZExtOfPtrToIntOrAddr()) {
if (auto *GEP = dyn_cast<GEPOperator>(LHSOp)) {
if (GEP->getPointerOperand() == RHSOp) {
if (GEP->hasNoUnsignedWrap() || GEP->hasNoUnsignedSignedWrap()) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dab200d..669d4f0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4003,18 +4003,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// Try to fold intrinsic into select/phi operands. This is legal if:
// * The intrinsic is speculatable.
- // * The select condition is not a vector, or the intrinsic does not
- // perform cross-lane operations.
- if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) &&
- isNotCrossLaneOperation(II))
+ // * The operand is one of the following:
+ //   - a phi.
+ //   - a select with a scalar condition.
+ //   - a select with a vector condition, provided the intrinsic is not a
+ //     cross-lane operation.
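+ // e.g. (illustrative): umax(select(%c, i8 3, i8 7), i8 5)
+ //   -> select(%c, i8 5, i8 7), since both arms fold to constants.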
+ if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) {
for (Value *Op : II->args()) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
- if (Instruction *R = FoldOpIntoSelect(*II, Sel))
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+ bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy();
+ if (IsVectorCond && !isNotCrossLaneOperation(II))
+ continue;
+ // Don't replace a scalar select with a more expensive vector select if
+ // we can't simplify both arms of the select.
+ bool SimplifyBothArms =
+ !Op->getType()->isVectorTy() && II->getType()->isVectorTy();
+ if (Instruction *R = FoldOpIntoSelect(
+ *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms))
return R;
+ }
if (auto *Phi = dyn_cast<PHINode>(Op))
if (Instruction *R = foldOpIntoPhi(*II, Phi))
return R;
}
+ }
if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
return Shuf;
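As a concrete (editor-supplied) example of the kind of fold the relaxed gating permits, using fabs as a stand-in for a speculatable, per-lane intrinsic:

#include <cmath>
#include <cstdio>

// Models folding a speculatable intrinsic into a select's operands:
//   fabs(select(C, -1.0, 2.0))  ==>  select(C, 1.0, 2.0)
// fabs is per-lane, so even a vector condition would be acceptable, and
// both arms fold to constants.
static double before(bool C) { return std::fabs(C ? -1.0 : 2.0); }
static double after(bool C) { return C ? 1.0 : 2.0; }

int main() {
  for (bool C : {false, true})
    if (before(C) != after(C))
      return 1;
  std::printf("fold is value-preserving\n");
  return 0;
}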
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 943c223..ede73f8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -664,7 +664,8 @@ public:
/// This also works for Cast instructions, which obviously do not have a
/// second operand.
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse = false);
+ bool FoldWithMultiUse = false,
+ bool SimplifyBothArms = false);
/// This is a convenience wrapper function for the above two functions.
Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3f11cae..67e2aae 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
}
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse) {
+ bool FoldWithMultiUse,
+ bool SimplifyBothArms) {
// Don't modify shared select instructions unless FoldWithMultiUse is set.
if (!SI->hasOneUse() && !FoldWithMultiUse)
return nullptr;
@@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
if (!NewTV && !NewFV)
return nullptr;
+ if (SimplifyBothArms && !(NewTV && NewFV))
+ return nullptr;
+
// Create an instruction for the arm that did not fold.
if (!NewTV)
NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this);
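A minimal sketch of the new bail-out (std::optional standing in for the simplified arm values; not the InstCombine types): the existing check rejects the fold when neither arm simplifies, and SimplifyBothArms additionally rejects it when only one does.

#include <optional>

static bool shouldFold(std::optional<int> NewTV, std::optional<int> NewFV,
                       bool SimplifyBothArms) {
  if (!NewTV && !NewFV)
    return false; // neither arm simplified: nothing to gain
  if (SimplifyBothArms && !(NewTV && NewFV))
    return false; // caller requires both arms to simplify
  return true;    // the arm that did not fold is materialized separately
}

int main() {
  // One simplified arm is enough by default, but not with the new flag.
  return shouldFold(42, std::nullopt, false) &&
                 !shouldFold(42, std::nullopt, true)
             ? 0
             : 1;
}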
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 4acc3f2..d347ced 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -614,6 +614,16 @@ static Decomposition decompose(Value *V,
return {V, IsKnownNonNegative};
}
+ if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() &&
+ canUseSExt(CI)) {
+ Preconditions.emplace_back(
+ CmpInst::ICMP_UGE, Op0,
+ ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1));
+ if (auto Decomp = MergeResults(Op0, CI, true))
+ return *Decomp;
+ return {V, IsKnownNonNegative};
+ }
+
if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) {
if (!isKnownNonNegative(Op0, DL))
Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
@@ -627,16 +637,6 @@ static Decomposition decompose(Value *V,
return {V, IsKnownNonNegative};
}
- if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() &&
- canUseSExt(CI)) {
- Preconditions.emplace_back(
- CmpInst::ICMP_UGE, Op0,
- ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1));
- if (auto Decomp = MergeResults(Op0, CI, true))
- return *Decomp;
- return {V, IsKnownNonNegative};
- }
-
// Decompose or as an add if there are no common bits between the operands.
if (match(V, m_DisjointOr(m_Value(Op0), m_ConstantInt(CI)))) {
if (auto Decomp = MergeResults(Op0, CI, IsSigned))
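A small self-contained check of the decomposition this hunk moves ahead of the nsw-add case: for V = X + C with C negative, the emitted precondition X uge -C guarantees the unsigned add does not wrap, so V decomposes as X minus a positive constant. A sketch assuming 32-bit operands:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t C = -5; // the negative constant in V = X + C
  for (uint32_t X = 0; X < 1000; ++X) {
    if (X >= uint32_t(-C)) {        // precondition: X uge 5
      uint32_t V = X + uint32_t(C); // the add being decomposed
      assert(V == X - 5);           // equals X - 5 with no unsigned wrap
    }
  }
  return 0;
}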
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index a83cbd17a7..f273e9d 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -64,10 +64,10 @@
using namespace llvm;
-namespace {
-
#define DEBUG_TYPE "mergeicmps"
+namespace {
+
// A BCE atom "Binary Compare Expression Atom" represents an integer load
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
// at the top.
@@ -128,11 +128,12 @@ private:
unsigned Order = 1;
DenseMap<const Value*, int> BaseToIndex;
};
+} // namespace
// If this value is a load from a constant offset w.r.t. a base address, and
// there are no other users of the load or address, returns the base address and
// the offset.
-BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
+static BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
auto *const LoadI = dyn_cast<LoadInst>(Val);
if (!LoadI)
return {};
@@ -175,6 +176,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset);
}
+namespace {
// A comparison between two BCE atoms, e.g. `a == o.a` in the example at the
// top.
// Note: the terminology is misleading: the comparison is symmetric, so there
@@ -239,6 +241,7 @@ class BCECmpBlock {
private:
BCECmp Cmp;
};
+} // namespace
bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
AliasAnalysis &AA) const {
@@ -302,9 +305,9 @@ bool BCECmpBlock::doesOtherWork() const {
// Visit the given comparison. If this is a comparison between two valid
// BCE atoms, returns the comparison.
-std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI,
- const ICmpInst::Predicate ExpectedPredicate,
- BaseIdentifier &BaseId) {
+static std::optional<BCECmp>
+visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId) {
// The comparison can only be used once:
// - For intermediate blocks, as a branch condition.
// - For the final block, as an incoming value for the Phi.
@@ -332,10 +335,9 @@ std::optional<BCECmp> visitICmp(const ICmpInst *const CmpI,
// Visit the given comparison block. If this is a comparison between two valid
// BCE atoms, returns the comparison.
-std::optional<BCECmpBlock> visitCmpBlock(Value *const Val,
- BasicBlock *const Block,
- const BasicBlock *const PhiBlock,
- BaseIdentifier &BaseId) {
+static std::optional<BCECmpBlock>
+visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) {
if (Block->empty())
return std::nullopt;
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
@@ -397,6 +399,7 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
Comparisons.push_back(std::move(Comparison));
}
+namespace {
// A chain of comparisons.
class BCECmpChain {
public:
@@ -420,6 +423,7 @@ private:
// The original entry block (before sorting);
BasicBlock *EntryBlock_;
};
+} // namespace
static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) {
return First.Lhs().BaseId == Second.Lhs().BaseId &&
@@ -742,9 +746,8 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
return true;
}
-std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
- BasicBlock *const LastBlock,
- int NumBlocks) {
+static std::vector<BasicBlock *>
+getOrderedBlocks(PHINode &Phi, BasicBlock *const LastBlock, int NumBlocks) {
// Walk up from the last block to find other blocks.
std::vector<BasicBlock *> Blocks(NumBlocks);
assert(LastBlock && "invalid last block");
@@ -777,8 +780,8 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
return Blocks;
}
-bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
+static bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA, DomTreeUpdater &DTU) {
LLVM_DEBUG(dbgs() << "processPhi()\n");
if (Phi.getNumIncomingValues() <= 1) {
LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
@@ -874,6 +877,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
return MadeChange;
}
+namespace {
class MergeICmpsLegacyPass : public FunctionPass {
public:
static char ID;
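The mechanical pattern this file-wide cleanup applies, shown as a freestanding sketch: per the LLVM coding standards, file-local types go in small anonymous namespaces while file-local free functions are declared static rather than wrapped in the namespace.

namespace {
// Types with internal linkage live in a small anonymous namespace...
struct OnlyUsedHere {
  int X = 0;
};
} // namespace

// ...while file-local free functions are marked static instead.
static int useIt(const OnlyUsedHere &V) { return V.X; }

int main() { return useIt(OnlyUsedHere{}); }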
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d2c100c9..3356516 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7231,6 +7231,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
return DenseMap<const SCEV *, Value *>();
}
+ VPlanTransforms::narrowInterleaveGroups(
+ BestVPlan, BestVF,
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8199,10 +8202,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
-
- if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
- VPlans.push_back(std::move(P));
-
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c95c887..428a8f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1191,7 +1191,6 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
- Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 167ba55..a1ad2db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2712,7 +2712,8 @@ public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2783,7 +2784,10 @@ public:
Opcode(Opcode), VFScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getChainOp()->getDefiningRecipe();
- assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+    // When cloning as part of a VPExpressionRecipe, the chain op may have been
+    // replaced by a temporary VPValue, so it doesn't have a defining recipe.
+ assert((!AccumulatorRecipe ||
+ isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
@@ -3093,6 +3097,11 @@ public:
/// removed before codegen.
void decompose();
+ unsigned getVFScaleFactor() const {
+ auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back());
+ return PR ? PR->getVFScaleFactor() : 1;
+ }
+
/// Method for generating code, must not be called as this recipe is abstract.
void execute(VPTransformState &State) override {
llvm_unreachable("recipe must be removed before execute");
@@ -4152,9 +4161,6 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;
- /// Represents the symbolic unroll factor of the loop.
- VPValue UF;
-
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;
@@ -4308,9 +4314,6 @@ public:
VPValue &getVF() { return VF; };
const VPValue &getVF() const { return VF; };
- /// Returns the symbolic UF of the vector loop region.
- VPValue &getSymbolicUF() { return UF; };
-
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
@@ -4320,12 +4323,6 @@ public:
void addVF(ElementCount VF) { VFs.insert(VF); }
- /// Remove \p VF from the plan.
- void removeVF(ElementCount VF) {
- assert(hasVF(VF) && "tried to remove VF not present in plan");
- VFs.remove(VF);
- }
-
void setVF(ElementCount VF) {
assert(hasVF(VF) && "Cannot set VF not already in plan");
VFs.clear();
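The new VPExpressionRecipe::getVFScaleFactor follows the dyn_cast-with-default idiom; a toy standalone version (RTTI in place of LLVM's casting machinery, hypothetical type names) for illustration:

struct Recipe {
  virtual ~Recipe() = default;
};
struct PartialReductionRecipe : Recipe {
  unsigned ScaleFactor;
  explicit PartialReductionRecipe(unsigned S) : ScaleFactor(S) {}
};

// Only expressions ending in a partial reduction scale the VF; all others
// report the neutral factor 1.
static unsigned getVFScaleFactor(const Recipe *Last) {
  auto *PR = dynamic_cast<const PartialReductionRecipe *>(Last);
  return PR ? PR->ScaleFactor : 1;
}

int main() {
  PartialReductionRecipe PR(4);
  Recipe Plain;
  return (getVFScaleFactor(&PR) == 4 && getVFScaleFactor(&Plain) == 1) ? 0 : 1;
}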
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1f1b42b..931a5b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -168,6 +168,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
+ case VPPartialReductionSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
@@ -300,14 +301,23 @@ InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
std::optional<unsigned> Opcode;
- VPValue *Op = getOperand(0);
- VPRecipeBase *OpR = Op->getDefiningRecipe();
-
- // If the partial reduction is predicated, a select will be operand 0
- if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
- OpR = Op->getDefiningRecipe();
+ VPValue *Op = getVecOp();
+ uint64_t MulConst;
+ // If the partial reduction is predicated, a select will be operand 1.
+ // If it isn't predicated and the mul isn't operating on a constant, then it
+ // should have been turned into a VPExpressionRecipe.
+ // FIXME: Replace the entire function with this once all partial reduction
+ // variants are bundled into VPExpressionRecipe.
+ if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
+ !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
+ auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
+ auto *InputType = Ctx.Types.inferScalarType(getVecOp());
+ return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
+ PhiType, VF, TTI::PR_None,
+ TTI::PR_None, {}, Ctx.CostKind);
}
+ VPRecipeBase *OpR = Op->getDefiningRecipe();
Type *InputTypeA = nullptr, *InputTypeB = nullptr;
TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
ExtBType = TTI::PR_None;
@@ -2856,11 +2866,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
- return Ctx.TTI.getExtendedReductionCost(
- Opcode,
- cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
- Instruction::ZExt,
- RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
+ auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+ return isa<VPPartialReductionRecipe>(ExpressionRecipes.back())
+ ? Ctx.TTI.getPartialReductionCost(
+ Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
+ RedTy, VF,
+ TargetTransformInfo::getPartialReductionExtendKind(
+ ExtR->getOpcode()),
+ TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
+ : Ctx.TTI.getExtendedReductionCost(
+ Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
+ SrcVecTy, std::nullopt, Ctx.CostKind);
}
case ExpressionTypes::MulAccReduction:
return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
@@ -2871,6 +2889,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
Opcode = Instruction::Sub;
[[fallthrough]];
case ExpressionTypes::ExtMulAccReduction: {
+ if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
+ auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+ auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+ auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+ return Ctx.TTI.getPartialReductionCost(
+ Opcode, Ctx.Types.inferScalarType(getOperand(0)),
+ Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext0R->getOpcode()),
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext1R->getOpcode()),
+ Mul->getOpcode(), Ctx.CostKind);
+ }
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
@@ -2910,12 +2941,13 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " = ";
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
getOperand(1)->printAsOperand(O, SlotTracker);
- O << " +";
- O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(Opcode) << " (";
getOperand(0)->printAsOperand(O, SlotTracker);
Red->printFlags(O);
@@ -2931,8 +2963,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
}
case ExpressionTypes::ExtNegatedMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
- O << " + reduce."
- << Instruction::getOpcodeName(
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (sub (0, mul";
auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
@@ -2956,9 +2988,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
case ExpressionTypes::MulAccReduction:
case ExpressionTypes::ExtMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
- O << " + ";
- O << "reduce."
- << Instruction::getOpcodeName(
+ O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
+ O << Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48cf763..3e85e6f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3519,18 +3519,31 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VPValue *VecOp = Red->getVecOp();
// Clamp the range if using extended-reduction is profitable.
- auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
- Type *SrcTy) -> bool {
+ auto IsExtendedRedValidAndClampRange =
+ [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
return LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
- Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
- CostKind);
+
+ InstructionCost ExtRedCost;
InstructionCost ExtCost =
cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
+
+ if (isa<VPPartialReductionRecipe>(Red)) {
+ TargetTransformInfo::PartialReductionExtendKind ExtKind =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtOpc);
+ // FIXME: Move partial reduction creation, costing and clamping
+ // here from LoopVectorize.cpp.
+ ExtRedCost = Ctx.TTI.getPartialReductionCost(
+ Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
+ llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
+ } else {
+ ExtRedCost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
+ Red->getFastMathFlags(), CostKind);
+ }
return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
},
Range);
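A freestanding model of this profitability gate (made-up costs, not real TTI queries; std::optional models InstructionCost validity): the combined recipe is formed only when its cost is valid and strictly beats the separate extend plus reduction.

#include <optional>

static bool isProfitable(std::optional<unsigned> ExtRedCost, unsigned ExtCost,
                         unsigned RedCost) {
  return ExtRedCost && *ExtRedCost < ExtCost + RedCost;
}

int main() {
  // A fused cost of 3 beats extend (2) + reduction (2) = 4 ...
  bool Fuse = isProfitable(3u, 2, 2);
  // ... while an invalid cost never does.
  bool Skip = isProfitable(std::nullopt, 2, 2);
  return (Fuse && !Skip) ? 0 : 1;
}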
@@ -3541,8 +3554,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
IsExtendedRedValidAndClampRange(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
- cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
- Instruction::CastOps::ZExt,
+ cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
Ctx.Types.inferScalarType(A)))
return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
@@ -3560,6 +3572,8 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
+
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return nullptr;
@@ -3568,16 +3582,41 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// Clamp the range if using multiply-accumulate-reduction is profitable.
auto IsMulAccValidAndClampRange =
- [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
- VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+ [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenCastRecipe *OuterExt) -> bool {
return LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *SrcTy =
Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
- auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
- InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
- isZExt, Opcode, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulAccCost;
+
+ if (IsPartialReduction) {
+ Type *SrcTy2 =
+ Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
+ // FIXME: Move partial reduction creation, costing and clamping
+ // here from LoopVectorize.cpp.
+ MulAccCost = Ctx.TTI.getPartialReductionCost(
+ Opcode, SrcTy, SrcTy2, RedTy, VF,
+ Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
+ Ext0->getOpcode())
+ : TargetTransformInfo::PR_None,
+ Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
+ Ext1->getOpcode())
+ : TargetTransformInfo::PR_None,
+ Mul->getOpcode(), CostKind);
+ } else {
+ // Only partial reductions support mixed extends at the moment.
+ if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
+ return false;
+
+ bool IsZExt =
+ !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
+ SrcVecTy, CostKind);
+ }
+
InstructionCost MulCost = Mul->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
InstructionCost ExtCost = 0;
@@ -3611,14 +3650,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
- // Match reduce.add(mul(ext, ext)).
- if (RecipeA && RecipeB &&
- (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
- match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
+ // Match reduce.add/sub(mul(ext, ext)).
+ if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
- IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
- Instruction::CastOps::ZExt,
- Mul, RecipeA, RecipeB, nullptr)) {
+ IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
if (Sub)
return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
cast<VPWidenRecipe>(Sub), Red);
@@ -3626,8 +3661,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
}
// Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
- if (!Sub &&
- IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+ if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
}
// TODO: Add an expression type for negated versions of other expression
@@ -3647,9 +3681,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
- IsMulAccValidAndClampRange(Ext0->getOpcode() ==
- Instruction::CastOps::ZExt,
- Mul, Ext0, Ext1, Ext)) {
+ IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
*Ext0, Ext0->getDebugLoc());
@@ -3956,9 +3988,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
// used.
// TODO: Assert that they aren't used.
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
- Plan.getSymbolicUF().replaceAllUsesWith(UF);
-
// If there are no users of the runtime VF, compute VFxUF by constant folding
// the multiplication of VF and UF.
if (VF.getNumUsers() == 0) {
@@ -3978,6 +4007,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
+ VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4045,14 +4075,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}
-/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
-/// number of members both equal to VF. The interleave group must also access
-/// the full vector width.
-static std::optional<ElementCount> isConsecutiveInterleaveGroup(
- VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
- VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
+/// Returns true if \p InterleaveR is a full interleave group with factor and
+/// number of members both equal to \p VF. The interleave group must also
+/// access the full vector width \p VectorRegWidth.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
+ unsigned VF, VPTypeAnalysis &TypeInfo,
+ unsigned VectorRegWidth) {
if (!InterleaveR || InterleaveR->getMask())
- return std::nullopt;
+ return false;
Type *GroupElementTy = nullptr;
if (InterleaveR->getStoredValues().empty()) {
@@ -4061,7 +4091,7 @@ static std::optional<ElementCount> isConsecutiveInterleaveGroup(
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return std::nullopt;
+ return false;
} else {
GroupElementTy =
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4069,27 +4099,13 @@ static std::optional<ElementCount> isConsecutiveInterleaveGroup(
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return std::nullopt;
+ return false;
}
- auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
- TypeSize Size = TTI.getRegisterBitWidth(
- VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
- : TargetTransformInfo::RGK_ScalableVector);
- assert(Size.isScalable() == VF.isScalable() &&
- "if Size is scalable, VF must to and vice versa");
- return Size.getKnownMinValue();
- };
-
- for (ElementCount VF : VFs) {
- unsigned MinVal = VF.getKnownMinValue();
- unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
- auto IG = InterleaveR->getInterleaveGroup();
- if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
- GroupSize == GetVectorWidthForVF(VF))
- return {VF};
- }
- return std::nullopt;
+ unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
+ auto IG = InterleaveR->getInterleaveGroup();
+ return IG->getFactor() == VF && IG->getNumMembers() == VF &&
+ GroupSize == VectorRegWidth;
}
/// Returns true if \p VPValue is a narrow VPValue.
@@ -4100,18 +4116,16 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
-std::unique_ptr<VPlan>
-VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
- const TargetTransformInfo &TTI) {
- using namespace llvm::VPlanPatternMatch;
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+ unsigned VectorRegWidth) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
-
if (!VectorLoop)
- return nullptr;
+ return;
VPTypeAnalysis TypeInfo(Plan);
+
+ unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
- std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
continue;
@@ -4125,33 +4139,30 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return nullptr;
+ return;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return nullptr;
+ return;
+
+ // Do not narrow interleave groups if there are VectorPointer recipes and
+ // the plan was unrolled. The recipe implicitly uses VF from
+ // VPTransformState.
+ // TODO: Remove restriction once the VF for the VectorPointer offset is
+ // modeled explicitly as operand.
+ if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
+ return;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Try to find a single VF, where all interleave groups are consecutive and
- // saturate the full vector width. If we already have a candidate VF, check
- // if it is applicable for the current InterleaveR, otherwise look for a
- // suitable VF across the Plans VFs.
- //
- if (VFToOptimize) {
- if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
- TTI))
- return nullptr;
- } else {
- if (auto VF = isConsecutiveInterleaveGroup(
- InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
- VFToOptimize = *VF;
- else
- return nullptr;
- }
+ // Bail out on non-consecutive interleave groups.
+ if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
+ VectorRegWidth))
+ return;
+
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -4185,34 +4196,24 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
- return nullptr;
+ return;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return nullptr;
+ return;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return nullptr;
+ return;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return nullptr;
-
- // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
- // original Plan into 2: a) a new clone which contains all VFs of Plan, except
- // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
- std::unique_ptr<VPlan> NewPlan;
- if (size(Plan.vectorFactors()) != 1) {
- NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
- Plan.setVF(*VFToOptimize);
- NewPlan->removeVF(*VFToOptimize);
- }
+ return;
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4283,8 +4284,9 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
- VPValue *UF = &Plan.getSymbolicUF();
- if (VFToOptimize->isScalable()) {
+ VPValue *UF = Plan.getOrAddLiveIn(
+ ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ if (VF.isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
CanIV->getScalarType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4296,10 +4298,6 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
}
removeDeadRecipes(Plan);
- assert(none_of(*VectorLoop->getEntryBasicBlock(),
- IsaPred<VPVectorPointerRecipe>) &&
- "All VPVectorPointerRecipes should have been removed");
- return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
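To make the narrowing legality check concrete, a freestanding model of the final test in isConsecutiveInterleaveGroup (hypothetical helper, not VPlan code): a group is narrowable only when its factor and member count equal VF and the group fills exactly one vector register.

#include <cstdio>

static bool isFullWidthGroup(unsigned Factor, unsigned Members,
                             unsigned ElemBits, unsigned VF,
                             unsigned VectorRegWidth) {
  // Bits covered by one member position across all VF lanes.
  unsigned GroupSize = ElemBits * VF;
  return Factor == VF && Members == VF && GroupSize == VectorRegWidth;
}

int main() {
  // Four i32 members at VF=4 saturate a 128-bit register: narrowable.
  std::printf("%d\n", isFullWidthGroup(4, 4, 32, 4, 128)); // 1
  // The same group does not fill a 256-bit register: rejected.
  std::printf("%d\n", isFullWidthGroup(4, 4, 32, 4, 256)); // 0
  return 0;
}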
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ca8d956..b28559b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -341,20 +341,14 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);
- /// Try to find a single VF among \p Plan's VFs for which all interleave
- /// groups (with known minimum VF elements) can be replaced by wide loads and
- /// stores processing VF elements, if all transformed interleave groups access
- /// the full vector width (checked via the maximum vector register width). If
- /// the transformation can be applied, the original \p Plan will be split in
- /// 2:
- /// 1. The original Plan with the single VF containing the optimized recipes
- /// using wide loads instead of interleave groups.
- /// 2. A new clone which contains all VFs of Plan except the optimized VF.
- ///
- /// This effectively is a very simple form of loop-aware SLP, where we use
- /// interleave groups to identify candidates.
- static std::unique_ptr<VPlan>
- narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
+  /// Try to convert a plan whose interleave groups have VF elements into a
+  /// plan where those groups are replaced by wide loads and stores processing
+  /// VF elements, if all transformed interleave groups access the full vector
+  /// width (checked via \p VectorRegWidth). This is effectively a very simple
+  /// form of loop-aware SLP, where we use interleave groups to identify
+  /// candidates.
+ static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+ unsigned VectorRegWidth);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 32e4b88..06c3d75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -151,6 +151,8 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
return RR->getVFScaleFactor();
+ if (auto *ER = dyn_cast<VPExpressionRecipe>(R))
+ return ER->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
VPInstruction::ReductionStartVector) &&