aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp13
-rw-r--r--llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/UniformityAnalysis.cpp1
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp30
-rw-r--r--llvm/lib/BinaryFormat/SFrame.cpp33
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp5
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp24
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp20
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp22
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.h2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp12
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineInstr.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp4
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp2
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp179
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp138
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp93
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp14
-rw-r--r--llvm/lib/CodeGen/WindowsSecureHotPatching.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h3
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp7
-rw-r--r--llvm/lib/IR/IRBuilder.cpp29
-rw-r--r--llvm/lib/IR/Intrinsics.cpp24
-rw-r--r--llvm/lib/IR/Metadata.cpp2
-rw-r--r--llvm/lib/LTO/LTO.cpp83
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp6
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp124
-rw-r--r--llvm/lib/MC/MCWin64EH.cpp3
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp5
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp12
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp2
-rw-r--r--llvm/lib/ObjCopy/COFF/COFFReader.cpp2
-rw-r--r--llvm/lib/Object/SFrameParser.cpp72
-rw-r--r--llvm/lib/Support/Unix/Path.inc2
-rw-r--r--llvm/lib/Support/Windows/Threading.inc60
-rw-r--r--llvm/lib/TableGen/Record.cpp26
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp2
-rw-r--r--llvm/lib/TableGen/TGLexer.h2
-rw-r--r--llvm/lib/TableGen/TGParser.cpp53
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp130
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp57
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td3
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td61
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h18
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td7
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td122
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td77
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp7
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp33
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp41
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td130
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h3
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp8
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrP10.td116
-rw-r--r--llvm/lib/Target/RISCV/RISCVCallingConv.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp28
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp133
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h1
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td52
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp52
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp15
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp50
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp245
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp45
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp28
-rw-r--r--llvm/lib/Target/X86/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp6
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp34
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp117
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp154
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp16
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp38
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp80
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp20
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp143
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp61
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp11
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp86
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp12
125 files changed, 3021 insertions, 849 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 759c553..2d52f34 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1373,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand))
return flushDenormalConstantFP(CFP, Inst, IsOutput);
- if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand))
+ if (isa<ConstantAggregateZero, UndefValue>(Operand))
return Operand;
Type *Ty = Operand->getType();
@@ -1389,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
Ty = VecTy->getElementType();
}
+ if (isa<ConstantExpr>(Operand))
+ return Operand;
+
if (const auto *CV = dyn_cast<ConstantVector>(Operand)) {
SmallVector<Constant *, 16> NewElts;
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
@@ -2628,14 +2631,14 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::nvvm_ceil_d:
return ConstantFoldFP(
ceil, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_fabs_ftz:
case Intrinsic::nvvm_fabs:
return ConstantFoldFP(
fabs, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_floor_ftz_f:
@@ -2643,7 +2646,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::nvvm_floor_d:
return ConstantFoldFP(
floor, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_rcp_rm_ftz_f:
@@ -2705,7 +2708,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return nullptr;
return ConstantFoldFP(
sqrt, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
// AMDGCN Intrinsics:
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index c871070..7025b83 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const {
Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
@@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const {
Result.TBAA = Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 15107c2..2e4063f 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -178,6 +178,7 @@ bool UniformityInfoWrapperPass::runOnFunction(Function &F) {
void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const {
OS << "UniformityInfo for function '" << m_function->getName() << "':\n";
+ m_uniformityInfo.print(OS);
}
void UniformityInfoWrapperPass::releaseMemory() {
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1b3da59..e9cf2ee 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::exp:
case Intrinsic::exp10:
case Intrinsic::exp2:
+ case Intrinsic::ldexp:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::canonicalize:
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
switch (ID) {
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
case Intrinsic::powi:
+ case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
default:
return OpdIdx == -1;
@@ -240,30 +246,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
-struct InterleaveIntrinsic {
- Intrinsic::ID Interleave, Deinterleave;
-};
-
-static InterleaveIntrinsic InterleaveIntrinsics[] = {
- {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
- {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
- {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
- {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
- {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
- {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
- {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
-};
-
-Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Interleave;
-}
-
-Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Deinterleave;
-}
-
unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::vector_interleave2:
diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp
index 3b436af..f1765d7 100644
--- a/llvm/lib/BinaryFormat/SFrame.cpp
+++ b/llvm/lib/BinaryFormat/SFrame.cpp
@@ -35,3 +35,36 @@ ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() {
};
return ArrayRef(ABIs);
}
+
+ArrayRef<EnumEntry<sframe::FREType>> sframe::getFRETypes() {
+ static constexpr EnumEntry<sframe::FREType> FRETypes[] = {
+#define HANDLE_SFRAME_FRE_TYPE(CODE, NAME) {#NAME, sframe::FREType::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FRETypes);
+}
+
+ArrayRef<EnumEntry<sframe::FDEType>> sframe::getFDETypes() {
+ static constexpr EnumEntry<sframe::FDEType> FDETypes[] = {
+#define HANDLE_SFRAME_FDE_TYPE(CODE, NAME) {#NAME, sframe::FDEType::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FDETypes);
+}
+
+ArrayRef<EnumEntry<sframe::AArch64PAuthKey>> sframe::getAArch64PAuthKeys() {
+ static constexpr EnumEntry<sframe::AArch64PAuthKey> AArch64PAuthKeys[] = {
+#define HANDLE_SFRAME_AARCH64_PAUTH_KEY(CODE, NAME) \
+ {#NAME, sframe::AArch64PAuthKey::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(AArch64PAuthKeys);
+}
+
+ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() {
+ static constexpr EnumEntry<sframe::FREOffset> FREOffsets[] = {
+#define HANDLE_SFRAME_FRE_OFFSET(CODE, NAME) {#NAME, sframe::FREOffset::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FREOffsets);
+}
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index a7c99b1..dcfd9aa 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2103,8 +2103,9 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
DI->eraseFromParent();
return;
}
-
- DI->setDebugValueUndef();
+ // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef.
+ if (!DI->isDebugLabel())
+ DI->setDebugValueUndef();
DI->moveBefore(&*Loc);
};
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8855740f..9b2851e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *NewTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(Node->Real->getType()));
auto *R = dyn_cast<Instruction>(Node->Real);
auto *I = dyn_cast<Instruction>(Node->Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
} else {
- ReplacementNode = Builder.CreateIntrinsic(
- Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
+ ReplacementNode =
+ Builder.CreateVectorInterleave({Node->Real, Node->Imag});
}
break;
}
@@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
- auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(MaskReal->getType()));
- auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
}
if (!NewInit)
- NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
+ NewInit =
+ Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation(
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
-
// We have to interleave initial origin values coming from IncomingBlock
Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming);
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {InitReal, InitImag});
+ auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 012d873..9ba1782 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
@@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e8f513a..e84ba91 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
const TargetOptions &Options = MF->getTarget().Options;
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (CanReassociate &&
- !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
+ if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc))
return false;
// Floating-point multiply-add with intermediate rounding.
@@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
if (!HasFMAD && !HasFMA)
return false;
- AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD;
+ AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dc5dfab..fd38c30 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
for (unsigned i = 0; i < Regs.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(LI);
@@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
for (unsigned i = 0; i < Vals.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(SI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f..d9d3569 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
- auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst);
auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
SmallPtr, *SmallMMO);
@@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
LLT PtrTy = MRI.getType(PtrReg);
auto OffsetCst = MIRBuilder.buildConstant(
LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
- auto SmallPtr =
- MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst);
MachineMemOperand *LargeMMO =
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
unsigned ByteOffset = Offset / 8;
Register NewAddrReg;
- MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
+ MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy,
+ ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
@@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
- if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
+ if (MI.getFlag(MachineInstr::FmAfn)) {
unsigned Flags = MI.getFlags();
auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
@@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
if (DstOff != 0) {
auto Offset =
MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
- Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0);
}
MIB.buildStore(Value, Ptr, *StoreMMO);
@@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
.getReg(0);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
@@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
Register StorePtr = Dst;
if (CurrOffset != 0) {
LLT DstTy = MRI.getType(Dst);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LdVal, StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
@@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
auto Offset =
MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
CurrOffset += CopyTy.getSizeInBytes();
@@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT DstTy = MRI.getType(Dst);
auto Offset =
MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 121d7e8..27df7e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -208,11 +208,20 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0,
return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags);
}
+MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1) {
+ return buildPtrAdd(Res, Op0, Op1,
+ MachineInstr::MIFlag::NoUWrap |
+ MachineInstr::MIFlag::InBounds);
+}
+
std::optional<MachineInstrBuilder>
MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
- const LLT ValueTy, uint64_t Value) {
+ const LLT ValueTy, uint64_t Value,
+ std::optional<unsigned> Flags) {
assert(Res == 0 && "Res is a result argument");
- assert(ValueTy.isScalar() && "invalid offset type");
+ assert(ValueTy.isScalar() && "invalid offset type");
if (Value == 0) {
Res = Op0;
@@ -221,7 +230,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0));
auto Cst = buildConstant(ValueTy, Value);
- return buildPtrAdd(Res, Op0, Cst.getReg(0));
+ return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags);
+}
+
+std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset(
+ Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) {
+ return materializePtrAdd(Res, Op0, ValueTy, Value,
+ MachineInstr::MIFlag::NoUWrap |
+ MachineInstr::MIFlag::InBounds);
}
MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 7153902..8b72c29 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -217,6 +217,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("nneg", MIToken::kw_nneg)
.Case("disjoint", MIToken::kw_disjoint)
.Case("samesign", MIToken::kw_samesign)
+ .Case("inbounds", MIToken::kw_inbounds)
.Case("nofpexcept", MIToken::kw_nofpexcept)
.Case("unpredictable", MIToken::kw_unpredictable)
.Case("debug-location", MIToken::kw_debug_location)
@@ -616,6 +617,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
.Case("!range", MIToken::md_range)
.Case("!DIExpression", MIToken::md_diexpr)
.Case("!DILocation", MIToken::md_dilocation)
+ .Case("!noalias.addrspace", MIToken::md_noalias_addrspace)
.Default(MIToken::Error);
}
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index d7cd067..0627f17 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -78,6 +78,7 @@ struct MIToken {
kw_nneg,
kw_disjoint,
kw_samesign,
+ kw_inbounds,
kw_debug_location,
kw_debug_instr_number,
kw_dbg_instr_ref,
@@ -151,6 +152,7 @@ struct MIToken {
md_tbaa,
md_alias_scope,
md_noalias,
+ md_noalias_addrspace,
md_range,
md_diexpr,
md_dilocation,
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 3a364d5..6a464d9 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1477,7 +1477,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
Token.is(MIToken::kw_nneg) ||
Token.is(MIToken::kw_disjoint) ||
Token.is(MIToken::kw_nusw) ||
- Token.is(MIToken::kw_samesign)) {
+ Token.is(MIToken::kw_samesign) ||
+ Token.is(MIToken::kw_inbounds)) {
// clang-format on
// Mine frame and fast math flags
if (Token.is(MIToken::kw_frame_setup))
@@ -1518,6 +1519,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
Flags |= MachineInstr::NoUSWrap;
if (Token.is(MIToken::kw_samesign))
Flags |= MachineInstr::SameSign;
+ if (Token.is(MIToken::kw_inbounds))
+ Flags |= MachineInstr::InBounds;
lex();
}
@@ -3482,6 +3485,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseMDNode(AAInfo.NoAlias))
return true;
break;
+ case MIToken::md_noalias_addrspace:
+ lex();
+ if (parseMDNode(AAInfo.NoAliasAddrSpace))
+ return true;
+ break;
case MIToken::md_range:
lex();
if (parseMDNode(Range))
@@ -3490,7 +3498,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
// TODO: Report an error on duplicate metadata nodes.
default:
return error("expected 'align' or '!tbaa' or '!alias.scope' or "
- "'!noalias' or '!range'");
+ "'!noalias' or '!range' or '!noalias.addrspace'");
}
}
if (expectAndConsume(MIToken::rparen))
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index ad7835a..ce1834a 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -820,6 +820,8 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
OS << "nusw ";
if (MI.getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ if (MI.getFlag(MachineInstr::InBounds))
+ OS << "inbounds ";
// NOTE: Please add new MIFlags also to the MI_FLAGS_STR in
// llvm/utils/update_mir_test_checks.py.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index da3665b..79047f7 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -585,6 +585,8 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) {
MIFlags |= MachineInstr::MIFlag::NoUSWrap;
if (GEP->hasNoUnsignedWrap())
MIFlags |= MachineInstr::MIFlag::NoUWrap;
+ if (GEP->isInBounds())
+ MIFlags |= MachineInstr::MIFlag::InBounds;
}
// Copy the nonneg flag.
@@ -1860,8 +1862,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "nneg ";
if (getFlag(MachineInstr::Disjoint))
OS << "disjoint ";
+ if (getFlag(MachineInstr::NoUSWrap))
+ OS << "nusw ";
if (getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ if (getFlag(MachineInstr::InBounds))
+ OS << "inbounds ";
// Print the opcode name.
if (TII)
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d25169..c612f8de 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << ", !noalias ";
AAInfo.NoAlias->printAsOperand(OS, MST);
}
+ if (AAInfo.NoAliasAddrSpace) {
+ OS << ", !noalias.addrspace ";
+ AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST);
+ }
if (getRanges()) {
OS << ", !range ";
getRanges()->printAsOperand(OS, MST);
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f742c4..21bf052 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -423,7 +423,7 @@ void ModuloScheduleExpander::generateExistingPhis(
// potentially define two values.
unsigned MaxPhis = PrologStage + 2;
if (!InKernel && (int)PrologStage <= LoopValStage)
- MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
+ MaxPhis = std::max((int)MaxPhis - LoopValStage, 1);
unsigned NumPhis = std::min(NumStages, MaxPhis);
Register NewReg;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 2d7987a..7ede564 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -306,7 +306,12 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
/// number if it is not zero. If DstReg is a physical register and the
/// existing subregister number of the def / use being updated is not zero,
/// make sure to set it to the correct physical subregister.
- void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
+ ///
+ /// If \p SubregToRegSrcInst is not empty, we are coalescing a
+ /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an
+ /// implicit-def of DstReg on instructions that define SrcReg.
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
+ ArrayRef<MachineInstr *> SubregToRegSrcInst = {});
/// If the given machine operand reads only undefined lanes add an undef
/// flag.
@@ -1443,6 +1448,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// CopyMI may have implicit operands, save them so that we can transfer them
// over to the newly materialized instruction after CopyMI is removed.
+ LaneBitmask NewMIImplicitOpsMask;
SmallVector<MachineOperand, 4> ImplicitOps;
ImplicitOps.reserve(CopyMI->getNumOperands() -
CopyMI->getDesc().getNumOperands());
@@ -1457,6 +1463,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
(MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) &&
"unexpected implicit virtual register def");
ImplicitOps.push_back(MO);
+ if (MO.isDef() && MO.getReg().isVirtual() &&
+ MRI->shouldTrackSubRegLiveness(DstReg))
+ NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
}
}
@@ -1499,14 +1508,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
} else {
assert(MO.getReg() == NewMI.getOperand(0).getReg());
- // We're only expecting another def of the main output, so the range
- // should get updated with the regular output range.
- //
- // FIXME: The range updating below probably needs updating to look at
- // the super register if subranges are tracked.
- assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
- "subrange update for implicit-def of super register may not be "
- "properly handled");
+ // If lanemasks need to be tracked, compile the lanemask of the NewMI
+ // implicit def operands to avoid subranges for the super-regs from
+ // being removed by code later on in this function.
+ if (MRI->shouldTrackSubRegLiveness(MO.getReg()))
+ NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
}
}
}
@@ -1606,7 +1612,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
- if ((SR.LaneMask & DstMask).none()) {
+ if ((SR.LaneMask & DstMask).none() &&
+ (SR.LaneMask & NewMIImplicitOpsMask).none()) {
LLVM_DEBUG(dbgs()
<< "Removing undefined SubRange "
<< PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
@@ -1870,11 +1877,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
}
}
-void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
- unsigned SubIdx) {
+void RegisterCoalescer::updateRegDefsUses(
+ Register SrcReg, Register DstReg, unsigned SubIdx,
+ ArrayRef<MachineInstr *> SubregToRegSrcInsts) {
bool DstIsPhys = DstReg.isPhysical();
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
+ // Coalescing a COPY may expose reads of 'undef' subregisters.
+ // If so, then explicitly propagate 'undef' to those operands.
if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
if (MO.isUndef())
@@ -1891,6 +1901,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
}
}
+ // If DstInt already has a subrange for the unused lanes, then we shouldn't
+ // create duplicate subranges when we update the interval for unused lanes.
+ LaneBitmask DstIntLaneMask;
+ if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ for (LiveInterval::SubRange &SR : DstInt->subranges())
+ DstIntLaneMask |= SR.LaneMask;
+ }
+
+ // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'.
SmallPtrSet<MachineInstr *, 8> Visited;
for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg),
E = MRI->reg_instr_end();
@@ -1914,6 +1933,80 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+ bool RequiresImplicitRedef = false;
+ if (!SubregToRegSrcInsts.empty()) {
+ // We can only add an implicit-def and undef if the sub registers match,
+ // e.g.
+ // %0:gr32 = INSTX
+ // %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined
+ // %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32
+ //
+ // This cannot be transformed into:
+ // %1.sub32:gr64 = INSTX
+ // undef %1.sub8:gr64 = INSTY , implicit-def %1
+ //
+ // Because that would thrash the top 24 bits of %1.sub32.
+ if (is_contained(SubregToRegSrcInsts, UseMI) &&
+ all_of(UseMI->defs(),
+ [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool {
+ if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef())
+ return true;
+ return SubIdx == MO.getSubReg();
+ })) {
+ // Add implicit-def of super-register to express that the whole
+ // register is defined by the instruction.
+ MachineInstrBuilder MIB(*MF, UseMI);
+ MIB.addReg(DstReg, RegState::ImplicitDefine);
+ RequiresImplicitRedef = true;
+ }
+
+ // If the coalesed instruction doesn't fully define the register, we need
+ // to preserve the original super register liveness for SUBREG_TO_REG.
+ //
+ // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
+ // but it introduces liveness for other subregisters. Downstream users may
+ // have been relying on those bits, so we need to ensure their liveness is
+ // captured with a def of other lanes.
+ if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ // First check if there is sufficient granularity in terms of subranges.
+ LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg());
+ LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+ LaneBitmask UnusedLanes = DstMask & ~UsedLanes;
+ if ((UnusedLanes & ~DstIntLaneMask).any()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt);
+ DstIntLaneMask |= UnusedLanes;
+ }
+
+ // After duplicating the live ranges for the low/hi bits, we
+ // need to update the subranges of the DstReg interval such that
+ // for a case like this:
+ //
+ // entry:
+ // 16B %1:gpr32 = INSTRUCTION (<=> UseMI)
+ // :
+ // if.then:
+ // 32B %1:gpr32 = MOVIMM32 ..
+ // 48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32
+ //
+ // Only the MOVIMM32 require a def of the top lanes and any intervals
+ // for the top 32-bits of the def at 16B should be removed.
+ for (LiveInterval::SubRange &SR : DstInt->subranges()) {
+ if (!Writes || RequiresImplicitRedef ||
+ (SR.LaneMask & UnusedLanes).none())
+ continue;
+
+ assert((SR.LaneMask & UnusedLanes) == SR.LaneMask &&
+ "Unexpected lanemask. Subrange needs finer granularity");
+
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false);
+ auto SegmentI = SR.find(UseIdx);
+ if (SegmentI != SR.end())
+ SR.removeSegment(SegmentI, true);
+ }
+ }
+ }
+
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned Op : Ops) {
MachineOperand &MO = UseMI->getOperand(Op);
@@ -1922,7 +2015,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
// turn a full def into a read-modify-write sub-register def and vice
// versa.
if (SubIdx && MO.isDef())
- MO.setIsUndef(!Reads);
+ MO.setIsUndef(!Reads || RequiresImplicitRedef);
// A subreg use of a partially undef (super) register may be a complete
// undef use now and then has to be marked that way.
@@ -2025,6 +2118,30 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
LIS->shrinkToUses(&LI);
}
+/// For a given use of value \p Idx, it returns the def in the current block,
+/// or otherwise all possible defs in preceding blocks.
+static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks,
+ SmallVector<MachineInstr *> &Instrs,
+ LiveIntervals *LIS, LiveInterval &SrcInt,
+ MachineBasicBlock *MBB, VNInfo *Idx) {
+ if (!Idx->isPHIDef()) {
+ MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def);
+ assert(Def && "Unable to find a def for SUBREG_TO_REG source operand");
+ Instrs.push_back(Def);
+ return true;
+ }
+
+ bool Any = false;
+ if (VisitedBlocks.count(MBB))
+ return false;
+ VisitedBlocks.insert(MBB);
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred,
+ SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred)));
+ }
+ return Any;
+}
+
bool RegisterCoalescer::joinCopy(
MachineInstr *CopyMI, bool &Again,
SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
@@ -2156,6 +2273,35 @@ bool RegisterCoalescer::joinCopy(
});
}
+ SmallVector<MachineInstr *> SubregToRegSrcInsts;
+ if (CopyMI->isSubregToReg()) {
+ // For the case where the copy instruction is a SUBREG_TO_REG, e.g.
+ //
+ // %0:gpr32 = movimm32 ..
+ // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32
+ // ...
+ // %0:gpr32 = COPY <something>
+ //
+ // After joining liveranges, the original `movimm32` will need an
+ // implicit-def to make it explicit that the entire register is written,
+ // i.e.
+ //
+ // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0
+ // ...
+ // undef %0.sub32:gpr64 = COPY <something> // Note that this does not
+ // // require an implicit-def,
+ // // because it has nothing to
+ // // do with the SUBREG_TO_REG.
+ LiveInterval &SrcInt =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI);
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks;
+ if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt,
+ CopyMI->getParent(),
+ SrcInt.Query(SubregToRegSlotIdx).valueIn()))
+ llvm_unreachable("SUBREG_TO_REG src requires a def");
+ }
+
ShrinkMask = LaneBitmask::getNone();
ShrinkMainRange = false;
@@ -2225,9 +2371,12 @@ bool RegisterCoalescer::joinCopy(
// Rewrite all SrcReg operands to DstReg.
// Also update DstReg operands to include DstIdx if it is set.
- if (CP.getDstIdx())
+ if (CP.getDstIdx()) {
+ assert(SubregToRegSrcInsts.empty() && "can this happen?");
updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
- updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+ }
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
+ SubregToRegSrcInsts);
// Shrink subregister ranges if necessary.
if (ShrinkMask.any()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3df434..a43020e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
}
}
- // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
- // than X, and the And doesn't change the lower iX bits, we can move the
- // AssertZext in front of the And and drop the AssertSext.
if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext &&
isa<ConstantSDNode>(N0.getOperand(1))) {
- SDValue BigA = N0.getOperand(0);
- EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
const APInt &Mask = N0.getConstantOperandAPInt(1);
- if (AssertVT.bitsLT(BigA_AssertVT) &&
- Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
- SDLoc DL(N);
- SDValue NewAssert =
- DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
- return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
- N0.getOperand(1));
+
+ // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
+ // than X, and the And doesn't change the lower iX bits, we can move the
+ // AssertZext in front of the And and drop the AssertSext.
+ if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) {
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ if (AssertVT.bitsLT(BigA_AssertVT) &&
+ Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ SDValue NewAssert =
+ DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
+ N0.getOperand(1));
+ }
}
+
+ // Remove AssertZext entirely if the mask guarantees the assertion cannot
+ // fail.
+ // TODO: Use KB countMinLeadingZeros to handle non-constant masks?
+ if (Mask.isIntN(AssertVT.getScalarSizeInBits()))
+ return N0;
}
return SDValue();
@@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
- StoreSize.getFixedValue() * 8)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (LifetimeEndBase.contains(
+ DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8,
+ StoreBase, StoreSize.getFixedValue() * 8)) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
@@ -28971,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // Match a pattern such as:
+ // (X | (X >> C0) | (X >> C1) | ...) & Mask
+ // This extracts contiguous parts of X and ORs them together before comparing.
+ // We can optimize this so that we directly check (X & SomeMask) instead,
+ // eliminating the shifts.
+
+ EVT VT = Root.getValueType();
+
+ // TODO: Support vectors?
+ if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N0 = Root.getOperand(0);
+ SDValue N1 = Root.getOperand(1);
+
+ if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
+ return SDValue();
+
+ APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
+
+ SDValue Src;
+ const auto IsSrc = [&](SDValue V) {
+ if (!Src) {
+ Src = V;
+ return true;
+ }
+
+ return Src == V;
+ };
+
+ SmallVector<SDValue> Worklist = {N0};
+ APInt PartsMask(VT.getSizeInBits(), 0);
+ while (!Worklist.empty()) {
+ SDValue V = Worklist.pop_back_val();
+ if (!V.hasOneUse() && (Src && Src != V))
+ return SDValue();
+
+ if (V.getOpcode() == ISD::OR) {
+ Worklist.push_back(V.getOperand(0));
+ Worklist.push_back(V.getOperand(1));
+ continue;
+ }
+
+ if (V.getOpcode() == ISD::SRL) {
+ SDValue ShiftSrc = V.getOperand(0);
+ SDValue ShiftAmt = V.getOperand(1);
+
+ if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
+ return SDValue();
+
+ auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal();
+ if (ShiftAmtVal > RootMask.getBitWidth())
+ return SDValue();
+
+ PartsMask |= (RootMask << ShiftAmtVal);
+ continue;
+ }
+
+ if (IsSrc(V)) {
+ PartsMask |= RootMask;
+ continue;
+ }
+
+ return SDValue();
+ }
+
+ if (!Src)
+ return SDValue();
+
+ SDLoc DL(Root);
+ return DAG.getNode(ISD::AND, DL, VT,
+ {Src, DAG.getConstant(PartsMask, DL, VT)});
+}
+
/// This is a stub for TargetLowering::SimplifySetCC.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, const SDLoc &DL,
bool foldBooleans) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
- return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+ if (SDValue C =
+ TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
+ return C;
+
+ if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND &&
+ isNullConstant(N1)) {
+
+ if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
+ return DAG.getSetCC(DL, VT, Res, N1, Cond);
+ }
+
+ return SDValue();
}
/// Given an ISD::SDIV node expressing a divide by constant, return
@@ -29415,7 +29513,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
MachineMemOperand *MMO;
};
- auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
+ auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
@@ -29428,13 +29526,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
LSN->getBasePtr(), Offset /*base offset*/,
LocationSize::precise(Size), LSN->getMemOperand()};
}
- if (const auto *LN = cast<LifetimeSDNode>(N))
+ if (const auto *LN = cast<LifetimeSDNode>(N)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
0,
- LocationSize::precise(LN->getSize()),
+ LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())),
(MachineMemOperand *)nullptr};
+ }
// Default.
return {false /*isvolatile*/,
/*isAtomic*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74172b2..ba0ab23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
case ISD::FP_TO_FP16:
LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
- if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) {
SDValue Op = Node->getOperand(0);
MVT SVT = Op.getSimpleValueType();
if ((SVT == MVT::f64 || SVT == MVT::f80) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 773ff48..02d1100 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
- case ISD::LIFETIME_START:
- case ISD::LIFETIME_END:
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex());
@@ -7847,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
}
- // Perform trivial constant folding.
- if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
- return SV;
+ if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) {
+ switch (Opcode) {
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::PTRADD:
+ case ISD::SUB:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::UMIN:
+ case ISD::OR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::SMIN:
+ // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison.
+ return N2.getOpcode() == ISD::POISON ? N2 : N1;
+ }
+ }
// Canonicalize an UNDEF to the RHS, even over a constant.
- if (N1.isUndef()) {
+ if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) {
if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
case ISD::PTRADD:
case ISD::SUB:
- // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(undef, non_undef_arg2) -> undef.
+ return N1;
case ISD::SIGN_EXTEND_INREG:
case ISD::UDIV:
case ISD::SDIV:
@@ -7868,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::SREM:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ // fold op(undef, non_undef_arg2) -> 0.
+ return getConstant(0, DL, VT);
}
}
}
// Fold a bunch of operators when the RHS is undef.
- if (N2.isUndef()) {
+ if (N2.getOpcode() == ISD::UNDEF) {
switch (Opcode) {
case ISD::XOR:
- if (N1.isUndef())
+ if (N1.getOpcode() == ISD::UNDEF)
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
@@ -7887,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::ADD:
case ISD::PTRADD:
case ISD::SUB:
+ // fold op(arg1, undef) -> undef.
+ return N2;
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(arg1, undef) -> poison.
+ return getPOISON(VT);
case ISD::MUL:
case ISD::AND:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ case ISD::UMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT);
case ISD::OR:
case ISD::SADDSAT:
case ISD::UADDSAT:
- // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) ->
- // poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getAllOnesConstant(DL, VT);
+ case ISD::UMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT);
+ case ISD::SMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL,
+ VT);
+ case ISD::SMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL,
+ VT);
}
}
+ // Perform trivial constant folding.
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
+ return SV;
+
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
@@ -9360,8 +9397,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
}
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
- SDValue Chain, int FrameIndex,
- int64_t Size) {
+ SDValue Chain, int FrameIndex) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9373,13 +9409,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
- ID.AddInteger(Size);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTs, Size);
+ LifetimeSDNode *N =
+ newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1636465..306e068 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) {
// FPTrunc is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
+ SDNodeFlags Flags;
+ if (auto *TruncInst = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*TruncInst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
DAG.getTargetConstant(
- 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout())),
+ Flags));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
@@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const int64_t ObjectSize =
- cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
// First check that the Alloca is static, otherwise it won't have a
@@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
const int FrameIndex = SI->second;
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex);
DAG.setRoot(Res);
return;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9474587..900da76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< " -> "
<< ASC->getDestAddressSpace()
<< ']';
- } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910..48d6b99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9471,7 +9471,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
ISD::SRL, DL, VT,
DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg),
DAG.getConstant(DeBruijn, DL, VT)),
- DAG.getConstant(ShiftAmt, DL, VT));
+ DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD));
SmallVector<uint8_t> Table(BitWidth, 0);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 68b8a00..3c91b0e 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2062,7 +2062,7 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
// FreeBSD has "__stack_chk_guard" defined externally on libc.so
if (M.getDirectAccessExternalData() &&
- !TM.getTargetTriple().isWindowsGNUEnvironment() &&
+ !TM.getTargetTriple().isOSCygMing() &&
!(TM.getTargetTriple().isPPC64() &&
TM.getTargetTriple().isOSFreeBSD()) &&
(!TM.getTargetTriple().isOSDarwin() ||
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 725e951..e9172f4 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1060,27 +1060,27 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
auto &Context = getContext();
if (Kind.isMergeableConst4() && MergeableConst4Section)
- return Context.getELFSection(".rodata.cst4." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst4." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 4);
if (Kind.isMergeableConst8() && MergeableConst8Section)
- return Context.getELFSection(".rodata.cst8." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst8." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 8);
if (Kind.isMergeableConst16() && MergeableConst16Section)
- return Context.getELFSection(".rodata.cst16." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst16." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 16);
if (Kind.isMergeableConst32() && MergeableConst32Section)
- return Context.getELFSection(".rodata.cst32." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst32." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 32);
if (Kind.isReadOnly())
- return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC);
+ return Context.getELFSection(".rodata." + SectionSuffix + ".",
+ ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
- return Context.getELFSection(".data.rel.ro." + SectionSuffix,
+ return Context.getELFSection(".data.rel.ro." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_WRITE);
}
diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
index 6267207..fd54190 100644
--- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
+++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
@@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable(
AddrOfOldGV, Twine("__ref_").concat(GV->getName()),
nullptr, GlobalVariable::NotThreadLocal);
+ // RefGV is created with isConstant = false, but we want to place RefGV into
+ // .rdata, not .data. It is important that the GlobalVariable be mutable
+ // from the compiler's point of view, so that the optimizer does not remove
+ // the global variable entirely and replace all references to it with its
+ // initial value.
+ //
+ // When the Windows hot-patch loader applies a hot-patch, it maps the
+ // pages of .rdata as read/write so that it can set each __ref_* variable
+ // to point to the original variable in the base image. Afterward, pages in
+ // .rdata are remapped as read-only. This protects the __ref_* variables from
+ // being overwritten during execution.
+ RefGV->setSection(".rdata");
+
// Create debug info for the replacement global variable.
DataLayout Layout = M->getDataLayout();
DIType *DebugType = DebugInfo.createPointerType(
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index bd0d72f..0e95369 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -157,8 +157,7 @@ private:
processSubtractRelocation(unsigned SectionID, relocation_iterator RelI,
const MachOObjectFile &BaseObj,
ObjSectionToIDMap &ObjSectionToID) {
- const MachOObjectFile &Obj =
- static_cast<const MachOObjectFile&>(BaseObj);
+ const MachOObjectFile &Obj = BaseObj;
MachO::any_relocation_info RE =
Obj.getRelocation(RelI->getRawDataRefImpl());
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7928772..3aa4f7a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1161,7 +1161,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
Builder.restoreIP(AllocaIP);
auto *KernelArgsPtr =
Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
llvm::Value *Arg =
@@ -1189,7 +1189,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
if (!updateToLocation(Loc))
return Loc.IP;
- Builder.restoreIP(Loc.IP);
// On top of the arrays that were filled up, the target offloading call
// takes as arguments the device id as well as the host pointer. The host
// pointer is used by the runtime library to identify the current target
@@ -5955,7 +5954,7 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
Builder.restoreIP(AllocaIP);
AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
ArgsBase->setAlignment(Align(8));
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
// Store the index value with offset in depend vector.
for (unsigned I = 0; I < NumLoops; ++I) {
@@ -8081,7 +8080,7 @@ void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
".offload_ptrs");
AllocaInst *ArgSizes = Builder.CreateAlloca(
ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
MapperAllocas.ArgsBase = ArgsBase;
MapperAllocas.Args = Args;
MapperAllocas.ArgSizes = ArgSizes;
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 28037d7..49c6dc7 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V,
return CreateShuffleVector(V, Zeros, Name + ".splat");
}
-Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
- Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex,
- MDNode *DbgInfo) {
+Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops,
+ const Twine &Name) {
+ assert(Ops.size() >= 2 && Ops.size() <= 8 &&
+ "Unexpected number of operands to interleave");
+
+ // Make sure all operands are the same type.
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type");
+
+#ifndef NDEBUG
+ for (unsigned I = 1; I < Ops.size(); I++) {
+ assert(Ops[I]->getType() == Ops[0]->getType() &&
+ "Vector interleave expects matching operand types!");
+ }
+#endif
+
+ unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size());
+ auto *SubvecTy = cast<VectorType>(Ops[0]->getType());
+ Type *DestTy = VectorType::get(SubvecTy->getElementType(),
+ SubvecTy->getElementCount() * Ops.size());
+ return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name);
+}
+
+Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base,
+ unsigned Dimension,
+ unsigned LastIndex,
+ MDNode *DbgInfo) {
auto *BaseType = Base->getType();
assert(isa<PointerType>(BaseType) &&
"Invalid Base ptr type for preserve.array.access.index.");
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 6c35ade..58a1f74 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
"Shouldn't change the signature");
return NewDecl;
}
+
+struct InterleaveIntrinsic {
+ Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+ {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+ {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+ {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+ {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+ {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+ {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+ {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 0dbd07f..1157cbe 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const {
Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct);
Result.Scope = Info.lookup(LLVMContext::MD_alias_scope);
Result.NoAlias = Info.lookup(LLVMContext::MD_noalias);
+ Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace);
}
return Result;
}
@@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) {
setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct);
setMetadata(LLVMContext::MD_alias_scope, N.Scope);
setMetadata(LLVMContext::MD_noalias, N.NoAlias);
+ setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace);
}
void Instruction::setNoSanitizeMetadata() {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 73e79c0..7f07568 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StableHashing.h"
@@ -742,18 +743,18 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
Conf.VisibilityScheme = Config::ELF;
}
- const SymbolResolution *ResI = Res.begin();
- for (unsigned I = 0; I != Input->Mods.size(); ++I)
- if (Error Err = addModule(*Input, I, ResI, Res.end()))
+ for (unsigned I = 0; I != Input->Mods.size(); ++I) {
+ if (auto Err = addModule(*Input, I, Res).moveInto(Res))
return Err;
+ }
- assert(ResI == Res.end());
+ assert(Res.empty());
return Error::success();
}
-Error LTO::addModule(InputFile &Input, unsigned ModI,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
+Expected<ArrayRef<SymbolResolution>>
+LTO::addModule(InputFile &Input, unsigned ModI,
+ ArrayRef<SymbolResolution> Res) {
Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
if (!LTOInfo)
return LTOInfo.takeError();
@@ -782,28 +783,32 @@ Error LTO::addModule(InputFile &Input, unsigned ModI,
bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular);
auto ModSyms = Input.module_symbols(ModI);
- addModuleToGlobalRes(ModSyms, {ResI, ResE},
+ addModuleToGlobalRes(ModSyms, Res,
IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0,
LTOInfo->HasSummary);
if (IsThinLTO)
- return addThinLTO(BM, ModSyms, ResI, ResE);
+ return addThinLTO(BM, ModSyms, Res);
RegularLTO.EmptyCombinedModule = false;
- Expected<RegularLTOState::AddedModule> ModOrErr =
- addRegularLTO(BM, ModSyms, ResI, ResE);
+ auto ModOrErr = addRegularLTO(BM, ModSyms, Res);
if (!ModOrErr)
return ModOrErr.takeError();
+ Res = ModOrErr->second;
- if (!LTOInfo->HasSummary)
- return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false);
+ if (!LTOInfo->HasSummary) {
+ if (Error Err = linkRegularLTO(std::move(ModOrErr->first),
+ /*LivenessFromIndex=*/false))
+ return Err;
+ return Res;
+ }
// Regular LTO module summaries are added to a dummy module that represents
// the combined regular LTO module.
if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, ""))
return Err;
- RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr));
- return Error::success();
+ RegularLTO.ModsWithSummaries.push_back(std::move(ModOrErr->first));
+ return Res;
}
// Checks whether the given global value is in a non-prevailing comdat
@@ -839,10 +844,10 @@ handleNonPrevailingComdat(GlobalValue &GV,
// Add a regular LTO object to the link.
// The resulting module needs to be linked into the combined LTO module with
// linkRegularLTO.
-Expected<LTO::RegularLTOState::AddedModule>
+Expected<
+ std::pair<LTO::RegularLTOState::AddedModule, ArrayRef<SymbolResolution>>>
LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
+ ArrayRef<SymbolResolution> Res) {
RegularLTOState::AddedModule Mod;
Expected<std::unique_ptr<Module>> MOrErr =
BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
@@ -899,22 +904,22 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
std::set<const Comdat *> NonPrevailingComdats;
SmallSet<StringRef, 2> NonPrevailingAsmSymbols;
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResI != ResE);
- SymbolResolution Res = *ResI++;
+ assert(!Res.empty());
+ const SymbolResolution &R = Res.consume_front();
assert(MsymI != MsymE);
ModuleSymbolTable::Symbol Msym = *MsymI++;
Skip();
if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) {
- if (Res.Prevailing) {
+ if (R.Prevailing) {
if (Sym.isUndefined())
continue;
Mod.Keep.push_back(GV);
// For symbols re-defined with linker -wrap and -defsym options,
// set the linkage to weak to inhibit IPO. The linkage will be
// restored by the linker.
- if (Res.LinkerRedefined)
+ if (R.LinkerRedefined)
GV->setLinkage(GlobalValue::WeakAnyLinkage);
GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage();
@@ -938,7 +943,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
// Set the 'local' flag based on the linker resolution for this symbol.
- if (Res.FinalDefinitionInLinkageUnit) {
+ if (R.FinalDefinitionInLinkageUnit) {
GV->setDSOLocal(true);
if (GV->hasDLLImportStorageClass())
GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::
@@ -947,7 +952,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
} else if (auto *AS =
dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) {
// Collect non-prevailing symbols.
- if (!Res.Prevailing)
+ if (!R.Prevailing)
NonPrevailingAsmSymbols.insert(AS->first);
} else {
llvm_unreachable("unknown symbol type");
@@ -965,7 +970,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
CommonRes.Alignment =
std::max(Align(SymAlignValue), CommonRes.Alignment);
}
- CommonRes.Prevailing |= Res.Prevailing;
+ CommonRes.Prevailing |= R.Prevailing;
}
}
@@ -991,7 +996,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
assert(MsymI == MsymE);
- return std::move(Mod);
+ return std::make_pair(std::move(Mod), Res);
}
Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
@@ -1032,19 +1037,19 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
}
// Add a ThinLTO module to the link.
-Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
- const SymbolResolution *ResITmp = ResI;
+Expected<ArrayRef<SymbolResolution>>
+LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
+ ArrayRef<SymbolResolution> Res) {
+ ArrayRef<SymbolResolution> ResTmp = Res;
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResITmp != ResE);
- SymbolResolution Res = *ResITmp++;
+ assert(!ResTmp.empty());
+ const SymbolResolution &R = ResTmp.consume_front();
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
- if (Res.Prevailing)
+ if (R.Prevailing)
ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
}
}
@@ -1059,14 +1064,14 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResI != ResE);
- SymbolResolution Res = *ResI++;
+ assert(!Res.empty());
+ const SymbolResolution &R = Res.consume_front();
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
- if (Res.Prevailing) {
+ if (R.Prevailing) {
assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
BM.getModuleIdentifier());
@@ -1074,7 +1079,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
// switch the linkage to `weak` to prevent IPOs from happening.
// Find the summary in the module for this very GV and record the new
// linkage so that we can switch it when we import the GV.
- if (Res.LinkerRedefined)
+ if (R.LinkerRedefined)
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier()))
S->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1082,7 +1087,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
// If the linker resolved the symbol to a local definition then mark it
// as local in the summary for the module we are adding.
- if (Res.FinalDefinitionInLinkageUnit) {
+ if (R.FinalDefinitionInLinkageUnit) {
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier())) {
S->setDSOLocal(true);
@@ -1110,7 +1115,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
}
- return Error::success();
+ return Res;
}
unsigned LTO::getMaxTasks() const {
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 1074669..a214513 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() {
// For each entry, reserve space for 2 32-bit indices and a 64-bit count.
size_t SectionBytes =
W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
- (*CGProfileSection->begin()).appendContents(SectionBytes, 0);
+ (*CGProfileSection->begin())
+ .setVarContents(std::vector<char>(SectionBytes, 0));
}
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
@@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() {
// (instead of emitting a zero-sized section) so these relocations are
// technically valid, even though we don't expect these relocations to
// actually be applied by the linker.
- Frag->appendContents(8, 0);
+ constexpr char zero[8] = {};
+ Frag->setVarContents(zero);
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 9c7b05b..e82393a 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -46,23 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+constexpr size_t FragBlockSize = 16384;
+// Ensure the new fragment can at least store a few bytes.
+constexpr size_t NewFragHeadroom = 8;
+
+static_assert(NewFragHeadroom >= alignof(MCFragment));
+static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom);
+
+MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) {
+ auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom);
+ FragSpace = Size - sizeof(MCFragment);
+ auto Chunk = std::unique_ptr<char[]>(new char[Size]);
+ auto *F = reinterpret_cast<MCFragment *>(Chunk.get());
+ FragStorage.push_back(std::move(Chunk));
+ return F;
+}
+
void MCObjectStreamer::newFragment() {
- addFragment(getContext().allocFragment<MCFragment>());
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ addFragment(F);
+}
+
+void MCObjectStreamer::ensureHeadroom(size_t Headroom) {
+ if (Headroom <= FragSpace)
+ return;
+ auto *F = allocFragSpace(Headroom);
+ new (F) MCFragment();
+ addFragment(F);
}
-void MCObjectStreamer::insert(MCFragment *F) {
- assert(F->getKind() != MCFragment::FT_Data &&
+void MCObjectStreamer::insert(MCFragment *Frag) {
+ assert(Frag->getKind() != MCFragment::FT_Data &&
"F should have a variable-size tail");
+ // Frag is not connected to FragSpace. Before modifying CurFrag with
+ // addFragment(Frag), allocate an empty fragment to maintain FragSpace
+ // connectivity, potentially reusing CurFrag's associated space.
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+
+ addFragment(Frag);
addFragment(F);
- newFragment();
+}
+
+void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
+ ensureHeadroom(Contents.size());
+ assert(FragSpace >= Contents.size());
+ llvm::copy(Contents, getCurFragEnd());
+ CurFrag->FixedSize += Contents.size();
+ FragSpace -= Contents.size();
}
void MCObjectStreamer::appendContents(size_t Num, char Elt) {
- CurFrag->appendContents(Num, Elt);
+ ensureHeadroom(Num);
+ MutableArrayRef<char> Data(getCurFragEnd(), Num);
+ llvm::fill(Data, Elt);
+ CurFrag->FixedSize += Num;
+ FragSpace -= Num;
}
void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
- CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+ CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind));
}
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
@@ -111,6 +171,8 @@ void MCObjectStreamer::reset() {
}
EmitEHFrame = true;
EmitDebugFrame = false;
+ FragStorage.clear();
+ FragSpace = 0;
MCStreamer::reset();
}
@@ -139,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -154,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
emitIntValue(AbsValue, Size);
return;
}
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- MCFixup::getDataKindForSize(Size)));
- DF->appendContents(Size, 0);
+ ensureHeadroom(Size);
+ addFixup(Value, MCFixup::getDataKindForSize(Size));
+ appendContents(Size, 0);
}
MCSymbol *MCObjectStreamer::emitCFILabel() {
@@ -190,7 +251,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// section.
MCFragment *F = CurFrag;
Symbol->setFragment(F);
- Symbol->setOffset(F->getContents().size());
+ Symbol->setOffset(F->getFixedSize());
emitPendingAssignments(Symbol);
}
@@ -256,6 +317,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
F0 = CurFrag;
}
+ // To maintain connectivity between CurFrag and FragSpace when CurFrag is
+ // modified, allocate an empty fragment and append it to the fragment list.
+ // (Subsections[I].second.Tail is not connected to FragSpace.)
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ F->setParent(Section);
+
auto &Subsections = Section->Subsections;
size_t I = 0, E = Subsections.size();
while (I != E && Subsections[I].first < Subsection)
@@ -263,13 +339,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// If the subsection number is not in the sorted Subsections list, create a
// new fragment list.
if (I == E || Subsections[I].first != Subsection) {
- auto *F = getContext().allocFragment<MCFragment>();
- F->setParent(Section);
Subsections.insert(Subsections.begin() + I,
{Subsection, MCSection::FragList{F, F}});
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = F;
+ } else {
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = Subsections[I].second.Tail;
+ // Ensure CurFrag is associated with FragSpace.
+ addFragment(F);
}
- Section->CurFragList = &Subsections[I].second;
- CurFrag = Section->CurFragList->Tail;
// Define the section symbol at subsection 0's initial fragment if required.
if (!NewSec)
@@ -340,11 +419,15 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
- size_t CodeOffset = F->getContents().size();
+ size_t CodeOffset = getCurFragSize();
+ SmallString<16> Content;
SmallVector<MCFixup, 1> Fixups;
- getAssembler().getEmitter().encodeInstruction(
- Inst, F->getContentsForAppending(), Fixups, STI);
- F->doneAppending();
+ getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI);
+ appendContents(Content);
+ if (CurFrag != F) {
+ F = CurFrag;
+ CodeOffset = 0;
+ }
F->setHasInstructions(STI);
if (Fixups.empty())
@@ -538,8 +621,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getCurrentFragment();
- DF->appendContents(ArrayRef(Data.data(), Data.size()));
+ appendContents(ArrayRef(Data.data(), Data.size()));
}
void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 72a8dd7..a87648a 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
+ // Ensure the fixups and appended content apply to the same fragment.
+ OS->ensureHeadroom(info->EpilogMap.size() * 2);
+
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 1ffe25c..8be5054 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -280,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
+ ensureHeadroom(2);
addFixup(SRE, FK_SecRel_2);
appendContents(2, 0);
}
@@ -293,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -308,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -318,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -328,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 898ac5d..26f45ce 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -103,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getCurrentFragment();
- const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- std::optional<MCFixupKind> MaybeKind =
- getAssembler().getBackend().getFixupKind("R_REF");
- if (!MaybeKind)
- report_fatal_error("failed to get fixup kind for R_REF relocation");
-
- MCFixupKind Kind = *MaybeKind;
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind);
- DF->addFixup(Fixup);
+ addFixup(MCSymbolRefExpr::create(Symbol, getContext()),
+ XCOFF::RelocationType::R_REF);
}
void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 7b5c3c0..e87696a 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() {
}
MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
SectionKind::getMetadata());
- llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
index 62a71d4..9b55f76 100644
--- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
// it is, find the target section unique id.
const coff_aux_section_definition *SD = SymRef.getSectionDefinition();
const coff_aux_weak_external *WE = SymRef.getWeakExternal();
- if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) {
int32_t Index = SD->getNumber(IsBigObj);
if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size())
return createStringError(object_error::parse_failed,
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
index 2d74d1d..5863490 100644
--- a/llvm/lib/Object/SFrameParser.cpp
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -10,27 +10,41 @@
#include "llvm/BinaryFormat/SFrame.h"
#include "llvm/Object/Error.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
using namespace llvm::object;
-template <typename T>
-static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
- uint64_t Offset) {
- static_assert(std::is_trivial_v<T>);
- if (Data.size() < Offset + sizeof(T)) {
+static Expected<ArrayRef<uint8_t>>
+getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) {
+ uint64_t End = SaturatingAdd(Offset, Size);
+ // Data.size() cannot be UINT64_MAX, as it would occupy the whole address
+ // space.
+ if (End > Data.size()) {
return createStringError(
formatv("unexpected end of data at offset {0:x} while reading [{1:x}, "
"{2:x})",
- Data.size(), Offset, Offset + sizeof(T))
+ Data.size(), Offset, End)
.str(),
object_error::unexpected_eof);
}
- return *reinterpret_cast<const T *>(Data.data() + Offset);
+ return Data.slice(Offset, Size);
+}
+
+template <typename T>
+static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
+ uint64_t Offset) {
+ static_assert(std::is_trivial_v<T>);
+ Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T));
+ if (!Slice)
+ return Slice.takeError();
+
+ return *reinterpret_cast<const T *>(Slice->data());
}
template <endianness E>
-Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
+Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents,
+ uint64_t SectionAddress) {
Expected<const sframe::Preamble<E> &> Preamble =
getDataSliceAs<sframe::Preamble<E>>(Contents, 0);
if (!Preamble)
@@ -48,8 +62,44 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
getDataSliceAs<sframe::Header<E>>(Contents, 0);
if (!Header)
return Header.takeError();
- return SFrameParser(Contents, *Header);
+ return SFrameParser(Contents, SectionAddress, *Header);
+}
+
+template <endianness E>
+Expected<ArrayRef<uint8_t>> SFrameParser<E>::getAuxHeader() const {
+ return getDataSlice(Data, sizeof(Header), Header.AuxHdrLen);
+}
+
+template <endianness E>
+Expected<ArrayRef<sframe::FuncDescEntry<E>>> SFrameParser<E>::fdes() const {
+ Expected<ArrayRef<uint8_t>> Slice = getDataSlice(
+ Data, getFDEBase(), Header.NumFDEs * sizeof(sframe::FuncDescEntry<E>));
+ if (!Slice)
+ return Slice.takeError();
+ return ArrayRef(
+ reinterpret_cast<const sframe::FuncDescEntry<E> *>(Slice->data()),
+ Header.NumFDEs);
+}
+
+template <endianness E>
+uint64_t SFrameParser<E>::getAbsoluteStartAddress(
+ typename FDERange::iterator FDE) const {
+ uint64_t Result = SectionAddress + FDE->StartAddress;
+
+ if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) ==
+ sframe::Flags::FDEFuncStartPCRel) {
+ uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data());
+ uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE);
+
+ assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() &&
+ "Iterator does not belong to this object!");
+
+ Result += FDEPtr - DataPtr;
+ }
+
+ return Result;
}
-template class llvm::object::SFrameParser<endianness::big>;
-template class llvm::object::SFrameParser<endianness::little>;
+template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>;
+template class LLVM_EXPORT_TEMPLATE
+ llvm::object::SFrameParser<endianness::little>;
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 277247e..cc02cae 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) {
size_t Size = Buf.size();
#endif
ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size);
- if (ssize_t(NumRead) == -1)
+ if (NumRead == -1)
return errorCodeToError(errnoAsErrorCode());
// The underlying operation on these platforms allow opening directories
// for reading in more cases than other platforms.
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index d862dbd..8dd7c88 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
Name.clear();
}
+namespace llvm::sys::windows {
+HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) {
+ // Ensure we load indeed a module from system32 path.
+ // As per GetModuleHandle documentation:
+ // "If lpModuleName does not include a path and there is more than one loaded
+ // module with the same base name and extension, you cannot predict which
+ // module handle will be returned.". This mitigates
+ // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637
+ SmallVector<wchar_t, MAX_PATH> Buf;
+ size_t Size = MAX_PATH;
+ do {
+ Buf.resize_for_overwrite(Size);
+ SetLastError(NO_ERROR);
+ Size = ::GetSystemDirectoryW(Buf.data(), Buf.size());
+ if (Size == 0)
+ return NULL;
+
+ // Try again with larger buffer.
+ } while (Size > Buf.size());
+
+ Buf.truncate(Size);
+ Buf.push_back(L'\\');
+ Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName));
+ Buf.push_back(0);
+
+ return ::GetModuleHandleW(Buf.data());
+}
+} // namespace llvm::sys::windows
+
SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
+ HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll");
+ if (kernelM) {
+ // SetThreadInformation is only available on Windows 8 and later. Since we
+ // still support compilation on Windows 7, we load the function dynamically.
+ typedef BOOL(WINAPI * SetThreadInformation_t)(
+ HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass,
+ _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation,
+ ULONG ThreadInformationSize);
+ static const auto pfnSetThreadInformation =
+ (SetThreadInformation_t)::GetProcAddress(kernelM,
+ "SetThreadInformation");
+ if (pfnSetThreadInformation) {
+ auto setThreadInformation = [](ULONG ControlMaskAndStateMask) {
+ THREAD_POWER_THROTTLING_STATE state{};
+ state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+ state.ControlMask = ControlMaskAndStateMask;
+ state.StateMask = ControlMaskAndStateMask;
+ return pfnSetThreadInformation(
+ ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state));
+ };
+
+ // Use EcoQoS for ThreadPriority::Background available (running on most
+ // efficent cores at the most efficient cpu frequency):
+ // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation
+ // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service
+ setThreadInformation(Priority == ThreadPriority::Background
+ ? THREAD_POWER_THROTTLING_EXECUTION_SPEED
+ : 0);
+ }
+ }
+
// https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
// Begin background processing mode. The system lowers the resource scheduling
// priorities of the thread so that it can perform background work without
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 1f3e5dc..3f318e2 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const {
}
break;
+ case GETDAGOPNAME:
+ if (const auto *Dag = dyn_cast<DagInit>(LHS)) {
+ return Dag->getName();
+ }
+ break;
+
case LOG2:
if (const auto *LHSi = dyn_cast_or_null<IntInit>(
LHS->convertInitializerTo(IntRecTy::get(RK)))) {
@@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const {
case SIZE: Result = "!size"; break;
case EMPTY: Result = "!empty"; break;
case GETDAGOP: Result = "!getdagop"; break;
+ case GETDAGOPNAME:
+ Result = "!getdagopname";
+ break;
case LOG2 : Result = "!logtwo"; break;
case LISTFLATTEN:
Result = "!listflatten";
@@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
SmallVector<std::pair<const Init *, const StringInit *>, 8> Args;
llvm::append_range(Args, LHSs->getArgAndNames());
llvm::append_range(Args, RHSs->getArgAndNames());
- return DagInit::get(Op, Args);
+ // Use the name of the LHS DAG if it's set, otherwise the name of the RHS.
+ const auto *NameInit = LHSs->getName();
+ if (!NameInit)
+ NameInit = RHSs->getName();
+ return DagInit::get(Op, NameInit, Args);
}
break;
}
@@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames());
break;
}
+ case SETDAGOPNAME: {
+ const auto *Dag = dyn_cast<DagInit>(LHS);
+ const auto *Op = dyn_cast<StringInit>(RHS);
+ if (Dag && Op)
+ return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(),
+ Dag->getArgNames());
+ break;
+ }
case ADD:
case SUB:
case MUL:
@@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const {
case STRCONCAT: Result = "!strconcat"; break;
case INTERLEAVE: Result = "!interleave"; break;
case SETDAGOP: Result = "!setdagop"; break;
+ case SETDAGOPNAME:
+ Result = "!setdagopname";
+ break;
case GETDAGARG:
Result = "!getdagarg<" + getType()->getAsString() + ">";
break;
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index aea1bb0..c369916 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("find", tgtok::XFind)
.Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
.Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+ .Case("setdagopname", tgtok::XSetDagOpName)
+ .Case("getdagopname", tgtok::XGetDagOpName)
.Case("getdagarg", tgtok::XGetDagArg)
.Case("getdagname", tgtok::XGetDagName)
.Case("setdagarg", tgtok::XSetDagArg)
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index ed7d8f3..5725e39 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -150,6 +150,8 @@ enum TokKind {
XGt,
XSetDagOp,
XGetDagOp,
+ XSetDagOpName,
+ XGetDagOpName,
XExists,
XListRemove,
XToLower,
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 62c5355..81b61b1 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TGParser.h"
+#include "TGLexer.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XCast:
case tgtok::XRepr:
case tgtok::XGetDagOp:
+ case tgtok::XGetDagOpName:
case tgtok::XInitialized: { // Value ::= !unop '(' Value ')'
UnOpInit::UnaryOp Code;
const RecTy *Type = nullptr;
@@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
Code = UnOpInit::GETDAGOP;
break;
+ case tgtok::XGetDagOpName:
+ Lex.Lex(); // eat the operation
+ Type = StringRecTy::get(Records);
+ Code = UnOpInit::GETDAGOPNAME;
+ break;
case tgtok::XInitialized:
Lex.Lex(); // eat the operation
Code = UnOpInit::INITIALIZED;
@@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XInterleave:
case tgtok::XGetDagArg:
case tgtok::XGetDagName:
- case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')'
+ case tgtok::XSetDagOp:
+ case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')'
tgtok::TokKind OpTok = Lex.getCode();
SMLoc OpLoc = Lex.getLoc();
Lex.Lex(); // eat the operation
@@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break;
case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break;
case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break;
+ case tgtok::XSetDagOpName:
+ Code = BinOpInit::SETDAGOPNAME;
+ break;
case tgtok::XGetDagArg:
Code = BinOpInit::GETDAGARG;
break;
@@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
ArgType = DagRecTy::get(Records);
break;
+ case tgtok::XSetDagOpName:
+ Type = DagRecTy::get(Records);
+ ArgType = DagRecTy::get(Records);
+ break;
case tgtok::XGetDagName:
Type = StringRecTy::get(Records);
ArgType = DagRecTy::get(Records);
@@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
// Deal with BinOps whose arguments have different types, by
// rewriting ArgType in between them.
switch (Code) {
- case BinOpInit::SETDAGOP:
- // After parsing the first dag argument, switch to expecting
- // a record, with no restriction on its superclasses.
- ArgType = RecordRecTy::get(Records, {});
- break;
- case BinOpInit::GETDAGARG:
- // After parsing the first dag argument, expect an index integer or a
- // name string.
- ArgType = nullptr;
- break;
- case BinOpInit::GETDAGNAME:
- // After parsing the first dag argument, expect an index integer.
- ArgType = IntRecTy::get(Records);
- break;
- default:
- break;
+ case BinOpInit::SETDAGOPNAME:
+ // After parsing the first dag argument, expect a string.
+ ArgType = StringRecTy::get(Records);
+ break;
+ case BinOpInit::SETDAGOP:
+ // After parsing the first dag argument, switch to expecting
+ // a record, with no restriction on its superclasses.
+ ArgType = RecordRecTy::get(Records, {});
+ break;
+ case BinOpInit::GETDAGARG:
+ // After parsing the first dag argument, expect an index integer or a
+ // name string.
+ ArgType = nullptr;
+ break;
+ case BinOpInit::GETDAGNAME:
+ // After parsing the first dag argument, expect an index integer.
+ ArgType = IntRecTy::get(Records);
+ break;
+ default:
+ break;
}
if (!consume(tgtok::comma))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index eca7ca5..ad42f4b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 0,
AArch64::LDNT1B_2Z_IMM_PSEUDO,
AArch64::LDNT1B_2Z_PSEUDO);
@@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 1,
AArch64::LDNT1H_2Z_IMM_PSEUDO,
AArch64::LDNT1H_2Z_PSEUDO);
@@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 2,
AArch64::LDNT1W_2Z_IMM_PSEUDO,
AArch64::LDNT1W_2Z_PSEUDO);
@@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 3,
AArch64::LDNT1D_2Z_IMM_PSEUDO,
AArch64::LDNT1D_2Z_PSEUDO);
@@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 0,
AArch64::LDNT1B_4Z_IMM_PSEUDO,
AArch64::LDNT1B_4Z_PSEUDO);
@@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 1,
AArch64::LDNT1H_4Z_IMM_PSEUDO,
AArch64::LDNT1H_4Z_PSEUDO);
@@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 2,
AArch64::LDNT1W_4Z_IMM_PSEUDO,
AArch64::LDNT1W_4Z_PSEUDO);
@@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 3,
AArch64::LDNT1D_4Z_IMM_PSEUDO,
AArch64::LDNT1D_4Z_PSEUDO);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b49754..4fef93c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11325,7 +11325,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
SDValue AArch64TargetLowering::LowerSELECT_CC(
ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
- iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+ iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
const SDLoc &DL, SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -11386,6 +11386,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Canonicalise absolute difference patterns:
+ // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
+ // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
+ //
+ // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
+ // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
+ // The second forms can be matched into subs+cneg.
+ if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
+ if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
+ FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS)
+ FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
+ else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
+ FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS)
+ TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
+ }
+
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
@@ -11523,7 +11539,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return true;
}
})) {
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
SDValue VectorCmp =
emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
if (VectorCmp)
@@ -11537,7 +11553,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (Flags.hasNoSignedZeros()) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
@@ -11616,10 +11632,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
+ SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
- return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
- DAG);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11627,7 +11642,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
SDLoc DL(Op);
EVT Ty = Op.getValueType();
@@ -11694,8 +11708,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
DAG.getUNDEF(MVT::f32), FVal);
}
- SDValue Res =
- LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
+ SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
+ Op->getFlags(), DL, DAG);
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -12292,7 +12306,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
- SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
+ // Ensure nodes can be recognized by isAssociativeAndCommutative.
+ SDNodeFlags Flags =
+ SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16674,7 +16690,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ I->getFastMathFlags().allowContract()));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -24112,6 +24128,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result directly
+// without transferring the result into a GPR in the process.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ // Limit to post-legalization in order to avoid peeling truncating stores.
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+ // Source operand is already a vector.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType().isVector())
+ return SDValue();
+
+ // Look through potential assertions.
+ while (Value->isAssert())
+ Value = Value.getOperand(0);
+
+ if (Value.getOpcode() != ISD::FP_TO_SINT &&
+ Value.getOpcode() != ISD::FP_TO_UINT)
+ return SDValue();
+ if (!Value->hasOneUse())
+ return SDValue();
+
+ SDValue FPSrc = Value.getOperand(0);
+ EVT SrcVT = FPSrc.getValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // No support for assignments such as i64 = fp_to_sint i32
+ EVT VT = Value.getSimpleValueType();
+ if (VT != SrcVT.changeTypeToInteger())
+ return SDValue();
+
+ // Create a 128-bit element vector to avoid widening. The floating point
+ // conversion is transformed into a single element conversion via a pattern.
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+ EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+ EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+ SDLoc DL(ST);
+ SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+ SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+ DCI.CombineTo(ST->getValue().getNode(), Extracted);
+ return SDValue(ST, 0);
+}
+
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24194,6 +24264,9 @@ static SDValue performSTORECombine(SDNode *N,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(ST);
+ if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+ return Res;
+
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
EVT EltVT = VT.getVectorElementType();
return EltVT == MVT::f32 || EltVT == MVT::f64;
@@ -26926,6 +26999,23 @@ static SDValue performSHLCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
}
+static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntrinsicID = N->getConstantOperandVal(1);
+ auto Register =
+ (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
+ : AArch64SysReg::RNDRRS);
+ SDLoc DL(N);
+ SDValue A = DAG.getNode(
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
+ N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
+ SDValue B = DAG.getNode(
+ AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
+ return DAG.getMergeValues(
+ {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -27241,22 +27331,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_rndr:
- case Intrinsic::aarch64_rndrrs: {
- unsigned IntrinsicID = N->getConstantOperandVal(1);
- auto Register =
- (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
- : AArch64SysReg::RNDRRS);
- SDLoc DL(N);
- SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
- N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
- SDValue B = DAG.getNode(
- AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
- return DAG.getMergeValues(
- {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
- }
+ case Intrinsic::aarch64_rndrrs:
+ return performRNDRCombine(N, DAG);
case Intrinsic::aarch64_sme_ldr_zt:
return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
DAG.getVTList(MVT::Other), N->getOperand(0),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 95d0e3b..ea63edd8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -662,7 +662,7 @@ private:
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal,
iterator_range<SDNode::user_iterator> Users,
- bool HasNoNans, const SDLoc &dl,
+ SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8685d7a0..59d4fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
// the target options or if FADD/FSUB has the contract fast-math flag.
- return Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
Inst.getFlag(MachineInstr::FmContract);
- return true;
}
return false;
}
@@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
case AArch64::FMUL_ZZZ_H:
case AArch64::FMUL_ZZZ_S:
case AArch64::FMUL_ZZZ_D:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
- (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
- Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// == Integer types ==
// -- Base instructions --
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07cacfa..251fd44 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index abcd550..b97d622 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -12,7 +12,7 @@
// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
-// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
template <typename T>
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+ // Strategy used to split logical immediate bitmasks.
+ enum class SplitStrategy {
+ Intersect,
+ };
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
+ bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
- if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
- return false;
-
- // If this immediate can be handled by one instruction, do not split it.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
- if (Insn.size() == 1)
- return false;
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
- unsigned OtherOpc) {
+bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy,
+ unsigned OtherOpc) {
// Try below transformation.
//
// MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
@@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
return splitTwoPartImm<T>(
MI,
- [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
- if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
+ // If this immediate is already a suitable bitmask, don't split it.
+ // TODO: Should we just combine the two instructions in this case?
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return std::nullopt;
+
+ // If this immediate can be handled by one instruction, don't split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return std::nullopt;
+
+ bool SplitSucc = false;
+ switch (Strategy) {
+ case SplitStrategy::Intersect:
+ SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ }
+ if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
@@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitINSERT(MI);
break;
case AArch64::ANDWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDSWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ Changed |= trySplitLogicalImm<uint32_t>(
+ AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
break;
case AArch64::ANDSXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ Changed |= trySplitLogicalImm<uint64_t>(
+ AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index bb0f667b..e0e1af7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
};
+ auto LowerTriOp = [&MI, &MIB](unsigned Opcode) {
+ MIB.buildInstr(Opcode, {MI.getOperand(0)},
+ {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)});
+ MI.eraseFromParent();
+ return true;
+ };
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
@@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_USUBSAT);
break;
}
+ case Intrinsic::aarch64_neon_udot:
+ return LowerTriOp(AArch64::G_UDOT);
+ case Intrinsic::aarch64_neon_sdot:
+ return LowerTriOp(AArch64::G_SDOT);
case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8b8fc8b..071c940 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
"VMEM CU scope prefetches do not fail on illegal address"
>;
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1383,6 +1389,9 @@ def FeatureAddSubU64Insts
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
"Has v_add_u64 and v_sub_u64 instructions">;
+def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
+ "true", "Has v_mad_u32 instruction">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1988,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
@@ -2042,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
+ FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2422,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts :
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
@@ -2832,6 +2843,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
+ AssemblerPredicate<(all_of FeatureMadU32Inst)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc37..6681393 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index c01e5d3..992572f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
+def gi_global_saddr_no_ioffset :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index dfaa145..39b4200 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
unsigned Opc;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
if (Subtarget->hasMADIntraFwdBug())
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
else
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
Clamp };
+
+ if (UseNoCarry) {
+ MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
+ ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
@@ -2049,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5636d89..983f1aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -174,6 +174,8 @@ private:
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 266dee1..fab99f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
+ MRI->use_nodbg_empty(I.getOperand(1).getReg());
unsigned Opc;
if (Subtarget->hasMADIntraFwdBug())
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
: AMDGPU::V_MAD_I64_I32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
+ : AMDGPU::V_MAD_NC_I64_I32_e64;
else
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+
+ if (UseNoCarry)
+ I.removeOperand(1);
+
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
@@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
}
unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ if (!IsB32 && STI.hasTrue16BitInsts())
+ Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
+ : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
unsigned CBL = STI.getConstantBusLimit(Opc);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -5789,6 +5801,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
@@ -6971,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
: (int64_t)SISrcMods::DST_OP_SEL);
}
@@ -6986,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
? (int64_t)(SISrcMods::OP_SEL_0)
: 0);
}
@@ -7015,14 +7038,15 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+ ? (int64_t)SISrcMods::DST_OP_SEL
+ : 0);
}
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index fe9743d0a..140e753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -264,6 +264,8 @@ private:
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f..50da8fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
- .widenScalarToNextPow2(0)
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder(G_ABS)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ if (ST.hasIntMinMax64()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, S64, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ }
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
@@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
- if (ST.hasGFX90AInsts()) {
+ if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
@@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
LLT::scalar(32), commonAlignment(Align(64), Offset));
// Pointer address
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
- B.buildPtrAdd(LoadAddr, QueuePtr,
- B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
+ B.buildObjectPtrOffset(
+ LoadAddr, QueuePtr,
+ B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
llvm_unreachable("failed to find kernarg segment ptr");
auto COffset = B.buildConstant(LLT::scalar(64), Offset);
- // TODO: Should get nuw
- return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
+ return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
}
/// Legalize a value that's loaded from kernel arguments. This is only used by
@@ -5676,8 +5693,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
return false;
- // FIXME: This should be nuw
- B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ B.buildObjectPtrOffset(DstReg, KernargPtrReg,
+ B.buildConstant(IdxTy, Offset).getReg(0));
return true;
}
@@ -7019,8 +7036,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
// Pointer address
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
B.buildCopy(SGPR01, Temp);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index f471881..b45627d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
BasePlusOffset = Base;
} else {
auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
- BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
+ BasePlusOffset =
+ B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
}
auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c5a1d9e..6c40eb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX:
case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
@@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
+ if (isSALUMapping(MI)) {
+ // There are no scalar 64-bit min and max, use vector instruction instead.
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&
+ Subtarget.hasIntMinMax64())
+ return getDefaultMappingVOP(MI);
+ return getDefaultMappingSOP(MI);
+ }
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
@@ -5364,6 +5372,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 38f9ee5..c1f1703 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -104,7 +104,9 @@
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
@@ -2066,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
// TODO: May want to move later or split into an early and late one.
addPass(AMDGPUCodeGenPreparePass(TM));
- // TODO: LICM
+ // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
+ // have expanded.
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) {
+ addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
+ /*UseMemorySSA=*/true));
+ }
}
Base::addIRPasses(addPass);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc42..44e65b3 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f99e716..1956a15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
}
//===----------------------------------------------------------------------===//
-// MUBUF - GFX11, GFX12.
+// MUBUF - GFX11, GFX12, GFX1250.
//===----------------------------------------------------------------------===//
// gfx11 instruction that accept both old and new assembler name.
@@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op,
def : Mnem_gfx12<gfx11_name, gfx12_name>;
}
+multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>,
+ MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> {
+ def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>;
+}
+
defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>;
defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>;
@@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
+defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>;
+defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">;
+defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">;
+
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 42edec0..c466f9c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen
Instrumentation
MC
MIRParser
+ ObjCARC
Passes
Scalar
SelectionDAG
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d..3ff675d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+defm DS_ADD_F64 : DS_Real_gfx12<0x054>;
+defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>;
+
let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0f172e0d..d5d1074 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,6 +11,7 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+ def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $dsaddr, $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
+class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $vaddr, $dsaddr, $offset, $cpol)
+>;
+
+class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $saddr, $voffset, $dsaddr, $offset, $cpol)
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
@@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatLoadLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatStoreLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
} // End SubtargetPredicate = isGFX125xOnly
+let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
+
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -3435,6 +3488,14 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
+defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
+
+defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">;
+defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">;
+
def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 785ede3..fba1c5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -248,6 +248,7 @@ protected:
bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
bool HasSafeCUPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -272,6 +273,7 @@ protected:
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
+ bool HasMadU32Inst = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -714,7 +716,9 @@ public:
bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
// DS_ADD_F64/DS_ADD_RTN_F64
- bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
+ bool hasLdsAtomicAddF64() const {
+ return hasGFX90AInsts() || hasGFX1250Insts();
+ }
bool hasMultiDwordFlatScratchAddressing() const {
return getGeneration() >= GFX9;
@@ -998,6 +1002,8 @@ public:
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1516,9 +1522,19 @@ public:
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+ // \returns true if the target has V_MAD_U32 instruction.
+ bool hasMadU32Inst() const { return HasMadU32Inst; }
+
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
bool hasVectorMulU64() const { return GFX1250Insts; }
+ // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
+ // instructions.
+ bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+ bool hasIntMinMax64() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d33..43ca548 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9017f4f..f4d7408 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
+ if (Subtarget->hasIntMinMax64())
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
+ Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1256,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+static unsigned getIntrMemWidth(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ return 8;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ return 32;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ return 64;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ return 128;
+ default:
+ llvm_unreachable("Unknown width");
+ }
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1527,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
@@ -1623,10 +1666,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -4241,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
if (Alignment > StackAlign) {
- uint64_t ScaledAlignment = (uint64_t)Alignment.value()
+ uint64_t ScaledAlignment = Alignment.value()
<< Subtarget->getWavefrontSizeLog2();
uint64_t StackAlignMask = ScaledAlignment - 1;
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 520c321..4b48fc4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the callee eventually calls createNewWaitcnt on it. We test this
+ // possibility in an articial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+ // Architectures higher than GFX10 do not have direct loads to
+ // LDS, so no work required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_lds_direct ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2aa6b4e..c2da937 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9281,6 +9281,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
default:
if (MI.isMetaInstruction())
return 0;
+
+ // If D16 Pseudo inst, get correct MC code size
+ const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
+ if (D16Info) {
+ // Assume d16_lo/hi inst are always in same size
+ unsigned LoInstOpcode = D16Info->LoOp;
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
+ DescSize = Desc.getSize();
+ }
+
return DescSize;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f1262e11..53f554e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2564,7 +2584,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
- if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
+ // We also require SCOPE_SE minimum if we not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
return false;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc..8303410 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index b6f9568..9f1e53d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let HasExtDPP = 0;
}
-let HasExt64BitDPP = 1 in {
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
@@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
+let HasExt64BitDPP = 1 in {
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+
class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
@@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>;
def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>;
-
-def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> {
let HasExtVOP3DPP = 0;
- let HasExtDPP = 0;
+ let HasExt64BitDPP = 1;
+}
+def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>;
+def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> {
+ let HasClamp = 1;
}
} // End HasExt64BitDPP = 1;
@@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SchedRW = [WriteIntMul] in {
+ let SubtargetPredicate = HasMadU32Inst in
+ defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>;
+ let SubtargetPredicate = isGFX1250Plus in {
+ defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>;
+ defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>;
+ }
+}
+
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
@@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
+defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>;
+defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>;
+defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>;
+defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>;
+} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd]
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -848,6 +872,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in
+ def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>;
+
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
@@ -972,10 +999,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
unsigned Val = N->getZExtValue();
unsigned New = 0;
if (}] # modifier_idx # [{ == 0) {
- New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
- : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
+ New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
+ : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
} else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
- New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
}
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
@@ -1019,7 +1046,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
def DstSelToOpSel3XForm : SDNodeXForm<timm, [{
uint32_t V = N->getZExtValue();
return CurDAG->getTargetConstant(
- (V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
+ (V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
SDLoc(N), MVT::i32);
}]>;
def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">,
@@ -1427,34 +1454,72 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
-let SubtargetPredicate = HasBitOp3Insts in {
+let HasClamp = 0, HasModifiers = 1 in {
+def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
+def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
+def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>;
+}
+
+let OtherPredicates = [HasBitOp3Insts] in {
let isReMaterializable = 1 in {
- defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16",
- VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>;
+ let SubtargetPredicate = isGFX940Plus in
+ defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>;
+ let SubtargetPredicate = isGFX1250Plus in
+ defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile,
+ BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>;
defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32",
VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>,
VOPD_Component<0x12, "v_bitop2_b32">;
}
+
def : GCNPat<
(i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
def : GCNPat<
- (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-
- def : GCNPat<
(i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
- def : GCNPat<
- (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-} // End SubtargetPredicate = HasBitOp3Insts
+ let SubtargetPredicate = isGFX940Plus in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ } // End SubtargetPredicate = isGFX940Plus
+
+ let SubtargetPredicate = isGFX1250Plus in {
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ } // End SubtargetPredicate = isGFX1250Plus
+
+} // End OtherPredicates = [HasBitOp3Insts]
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
(AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
@@ -1746,6 +1811,17 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
+defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">;
+defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>;
+
+defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
+defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
+defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
+defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>;
+defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>;
+defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>;
+defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>;
+
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d3..badbba9 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0);
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
@@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
let Inst{14 - 8} = sdst;
}
+class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName>
+ : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
+class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName>
+ : Base_VOP3_DPP8_t16<op, p, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: Base_VOP3_DPP8<op, ps, opName> {
bits<8> sdst;
@@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple<
multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
VOP3be_Realtriple<Gen, op, 1>;
+multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> {
+ def _e64_dpp#Gen.Suffix :
+ VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>;
+}
+
+multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> {
+ let DecoderNamespace =
+ Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16");
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ }
+}
+
+multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3a_BITOP3_gfx12<op, ps.Pfl>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// VOP3 GFX11
//===----------------------------------------------------------------------===//
@@ -2046,6 +2113,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+ VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>;
+
+multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
+ defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>;
+ defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>;
+}
+
multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
string opName = NAME> :
VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 868556b..6dfe846 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
// Add the R_ARM_NONE fixup at the same position
void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
+ visitUsedSymbol(*PersonalitySym);
const MCSymbolRefExpr *PersonalityRef =
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
-
- visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getCurrentFragment();
- DF->addFixup(
- MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
+ addFixup(PersonalityRef, FK_Data_4);
}
void ARMELFStreamer::FlushPendingOffset() {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d96136c..a5bf0e5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2621,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- if (isa<ConstantSDNode>(Op->getOperand(2)))
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ if (isa<ConstantSDNode>(Op2))
return Op;
- return SDValue();
+
+ MVT IdxTy = MVT::getIntegerVT(EltSizeInBits);
+ MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts);
+
+ if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy))
+ return SDValue();
+
+ SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1);
+ SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2);
+
+ SmallVector<SDValue, 32> RawIndices;
+ for (unsigned i = 0; i < NumElts; ++i)
+ RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+ SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices);
+
+ // insert vec, elt, idx
+ // =>
+ // select (splatidx == {0,1,2...}) ? splatelt : vec
+ SDValue SelectCC =
+ DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ);
+ return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0);
}
SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..7a8395a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
// fixup_Mips_GPREL32 desginates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(8, 0);
@@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_TPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_TPREL64);
S.appendContents(8, 0);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 96f52275..95abcde 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
CodeGenOptLevel OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm) {
- doMulWide = (OptLevel > CodeGenOptLevel::None);
-}
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e504a8f..9e0f88e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -40,9 +40,6 @@ private:
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
- // If true, generate mul.wide from sext and mul
- bool doMulWide;
-
NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
bool usePrecSqrtF32(const SDNode *N) const;
bool useF32FTZ() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f79b862..4fd3623 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
- ISD::STORE});
+ ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5219,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N,
return SDValue();
}
+// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
+static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (!Op.hasOneUse())
+ return SDValue();
+ EVT ToVT = N->getValueType(0);
+ EVT FromVT = Op.getValueType();
+ if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
+ (ToVT == MVT::i64 && FromVT == MVT::i32)))
+ return SDValue();
+ if (!(Op.getOpcode() == ISD::MUL ||
+ (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned ExtOpcode = N->getOpcode();
+ unsigned Opcode = 0;
+ if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
+ Opcode = NVPTXISD::MUL_WIDE_SIGNED;
+ else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
+ Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
+ else
+ return SDValue();
+ SDValue RHS = Op.getOperand(1);
+ if (Op.getOpcode() == ISD::SHL) {
+ const auto ShiftAmt = Op.getConstantOperandVal(1);
+ const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
+ }
+ return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
+}
+
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -5825,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return combineADDRSPACECAST(N, DCI);
case ISD::AND:
return PerformANDCombine(N, DCI);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return combineMulWide(N, DCI, OptLevel);
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 86d6f7c..41bfe7e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
-def doMulWide : Predicate<"doMulWide">;
-
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
@@ -836,36 +834,28 @@ def MULWIDES64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">;
def MULWIDES64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">;
-def MULWIDES64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">;
def MULWIDEU64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">;
def MULWIDEU64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">;
-def MULWIDEU64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">;
def MULWIDES32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">;
def MULWIDES32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">;
-def MULWIDES32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">;
def MULWIDEU32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">;
def MULWIDEU32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">;
-def MULWIDEU32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">;
-def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>;
// Matchers for signed, unsigned mul.wide ISD nodes.
-let Predicates = [doMulWide] in {
+let Predicates = [hasOptEnabled] in {
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
@@ -877,85 +867,6 @@ let Predicates = [doMulWide] in {
def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
}
-// Predicates used for converting some patterns to mul.wide.
-def SInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(16);
-}]>;
-
-def IntConst_0_30 : PatLeaf<(imm), [{
- // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(31);
-}]>;
-
-def IntConst_0_14 : PatLeaf<(imm), [{
- // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(15);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(32, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(16, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-let Predicates = [doMulWide] in {
- def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDES64Imm $a, (SHL2MUL32 $b))>;
- def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDEU64Imm $a, (SHL2MUL32 $b))>;
-
- def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDES32Imm $a, (SHL2MUL16 $b))>;
- def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDEU32Imm $a, (SHL2MUL16 $b))>;
-
- // Convert "sign/zero-extend then multiply" to mul.wide.
- def : Pat<(mul (sext i32:$a), (sext i32:$b)),
- (MULWIDES64 $a, $b)>;
- def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>;
-
- def : Pat<(mul (zext i32:$a), (zext i32:$b)),
- (MULWIDEU64 $a, $b)>;
- def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>;
-
- def : Pat<(mul (sext i16:$a), (sext i16:$b)),
- (MULWIDES32 $a, $b)>;
- def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>;
-
- def : Pat<(mul (zext i16:$a), (zext i16:$b)),
- (MULWIDEU32 $a, $b)>;
- def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>;
-}
-
//
// Integer multiply-add
//
@@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>;
defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>;
}
+multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> {
+ def rrr:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>;
+ def rri:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>;
+ def rir:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>;
+ def rii:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>;
+}
+
+def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>;
+def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>;
+
+let Predicates = [hasOptEnabled] in {
+defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>;
+defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>;
+}
+
foreach t = [I16RT, I32RT, I64RT] in {
def NEG_S # t.Size :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5779d4e..0e8828f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -243,8 +243,6 @@ public:
createObjectTargetWriter() const override {
return createPPCXCOFFObjectWriter(TT.isArch64Bit());
}
-
- std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
};
} // end anonymous namespace
@@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
return std::nullopt;
}
-std::optional<MCFixupKind>
-XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const {
- return StringSwitch<std::optional<MCFixupKind>>(Name)
- .Case("R_REF", PPC::fixup_ppc_nofixup)
- .Default(std::nullopt);
-}
-
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 9e8ee9f..df0c666 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -48,8 +48,7 @@ enum Fixups {
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
- /// register number. It can also be used to tie the ref symbol to prevent it
- /// from being garbage collected on AIX.
+ /// register number.
fixup_ppc_nofixup,
/// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index f75ab62..a04f404 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
switch ((unsigned)Fixup.getKind()) {
default:
report_fatal_error("Unimplemented fixup kind.");
+ case XCOFF::RelocationType::R_REF:
+ return {XCOFF::RelocationType::R_REF, 0};
case PPC::fixup_ppc_half16: {
const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15;
switch (Specifier) {
@@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
case PPC::fixup_ppc_br24abs:
return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
- case PPC::fixup_ppc_nofixup: {
- if (Specifier == PPC::S_None)
- return {XCOFF::RelocationType::R_REF, 0};
- else
- llvm_unreachable("Unsupported Modifier");
- } break;
case FK_Data_4:
case FK_Data_8:
const uint8_t SignAndSizeForFKData =
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index d295f35..1dc485d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
(COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
}
-class XXEvalPattern <dag pattern, bits<8> imm> :
- Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+// =============================================================================
+// XXEVAL Instruction Pattern Definitions
+// =============================================================================
+//
+// XXEVAL instruction performs 256 different logical operations on three vector
+// operands using an 8-bit immediate value to select the operation.
+// Format: xxeval XT, XA, XB, XC, IMM
+// For example:
+// Equivalent function A?xor(B,C):and(B,C) is performed by
+// xxeval XT, XA, XB, XC, 22
+//
+// REGISTER CLASS CONSTRAINTS:
+// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64]
+// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC
+// =============================================================================
+
+class XXEvalPattern<dag pattern, bits<8> imm>
+ : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+
+class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm>
+ : Pat<(Vt InputPattern),
+ !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)),
+ // VSRC path: direct XXEVAL for v4i32 and v2i64
+ (XXEVAL $vA, $vB, $vC, Imm),
+ // VRRC path: wrap with COPY_TO_REGCLASS for other types
+ (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC),
+ (COPY_TO_REGCLASS Vt:$vB, VSRC),
+ (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm),
+ VRRC))> {}
+
+// =============================================================================
+// PatFrags for Bitcast-Aware Vector bitwise Operations
+//
+// Each PatFrags defines TWO alternatives for pattern matcher to choose:
+// - Direct operation (for v4i32)
+// - Bitcast operation (for other types: v2i64, v16i8, v8i16)
+// =============================================================================
+
+// Basic Binary Operations
+def VAnd
+ : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b),
+ (bitconvert(and
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VXor
+ : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b),
+ (bitconvert(xor
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b),
+ (bitconvert(or
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VNot
+ : PatFrags<(ops node:$a), [(vnot node:$a),
+ (bitconvert(vnot(v4i32(bitconvert node:$a))))]>;
+
+// Derived bitwise operations
+// Vector NOR operation (not(or))
+def VNor
+ : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)),
+ (bitconvert(vnot(or
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b)))))]>;
+
+// Vector EQV operation (not(xor))
+def VEqv
+ : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)),
+ (bitconvert(vnot(xor
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b)))))]>;
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C)
+// and emit the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT)
+// - AND(B,C) is the "false" case op on vectors B and C
+// =============================================================================
+multiclass XXEvalTernarySelectAnd<ValueType Vt> {
+ // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 22>;
+
+ // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 24>;
+
+ // Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 25>;
+
+ // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>;
+
+ // Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>;
+}
let Predicates = [PrefixInstrs, HasP10Vector] in {
let AddedComplexity = 400 in {
@@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
// (xor A, (or B, C))
def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
+ // XXEval Patterns for ternary Operations.
+ foreach Ty = [v4i32, v2i64, v8i16, v16i8] in {
+ defm : XXEvalTernarySelectAnd<Ty>;
+ }
+
// Anonymous patterns to select prefixed VSX loads and stores.
// Load / Store f128
def : Pat<(f128 (load PDForm:$src)),
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index 4c303a9..da6b95d 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt,
// Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31.
def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt,
(sequence "X%u", 16, 31))>;
+
+def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>;
+def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs,
+ (sequence "X%u", 16, 31))>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 34910b7..f223fdbe 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
// Transform (sra (shl X, C1) C2) with C1 < C2
// -> (SignedBitfieldExtract X, msb, lsb)
if (N0.getOpcode() == ISD::SHL) {
- auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01C)
return false;
@@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
// Transform (sra (shl X, C1) C2) with C1 > C2
// -> (NDS.BFOS X, lsb, msb)
if (N0.getOpcode() == ISD::SHL) {
- auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01C)
return false;
@@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C)
// where C2 has 32 leading zeros and C3 trailing zeros.
SDNode *SRLIW = CurDAG->getMachineNode(
- RISCV::SRLIW, DL, VT, N0->getOperand(0),
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(TrailingZeros, DL, VT));
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
@@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// - without Zba a tablegen pattern applies the very same
// transform as we would have done here
SDNode *SLLI = CurDAG->getMachineNode(
- RISCV::SLLI, DL, VT, N0->getOperand(0),
+ RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LeadingZeros, DL, VT));
SDNode *SRLI = CurDAG->getMachineNode(
RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
@@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned TrailingZeros = llvm::countr_zero(Mask);
if (LeadingZeros == 32 && TrailingZeros > ShAmt) {
SDNode *SRLIW = CurDAG->getMachineNode(
- RISCV::SRLIW, DL, VT, N0->getOperand(0),
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(TrailingZeros, DL, VT));
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
@@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (TrailingOnes == 32) {
SDNode *SRLI = CurDAG->getMachineNode(
Subtarget->is64Bit() ? RISCV::SRLIW : RISCV::SRLI, DL, VT,
- N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
+ N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
ReplaceNode(Node, SRLI);
return;
}
@@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (HasBitTest && ShAmt + 1 == TrailingOnes) {
SDNode *BEXTI = CurDAG->getMachineNode(
Subtarget->hasStdExtZbs() ? RISCV::BEXTI : RISCV::TH_TST, DL, VT,
- N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
+ N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
ReplaceNode(Node, BEXTI);
return;
}
const unsigned Msb = TrailingOnes - 1;
const unsigned Lsb = ShAmt;
- if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb))
+ if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb))
return;
unsigned LShAmt = Subtarget->getXLen() - TrailingOnes;
SDNode *SLLI =
- CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LShAmt, DL, VT));
SDNode *SRLI = CurDAG->getMachineNode(
RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
@@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
break;
unsigned LShAmt = Subtarget->getXLen() - ExtSize;
SDNode *SLLI =
- CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LShAmt, DL, VT));
SDNode *SRAI = CurDAG->getMachineNode(
RISCV::SRAI, DL, VT, SDValue(SLLI, 0),
@@ -2942,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
/// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9.
bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
SDValue &Offset) {
- // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
- // a 9-bit immediate can be folded.
+ if (SelectAddrFrameIndex(Addr, Base, Offset))
+ return true;
SDLoc DL(Addr);
MVT VT = Addr.getSimpleValueType();
@@ -2953,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
if (isUInt<9>(CVal)) {
Base = Addr.getOperand(0);
- // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
- // a 9-bit immediate can be folded.
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3..b47d89b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2739,27 +2739,6 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
}
}
-bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
- EVT ScalarTy) const {
- if (!ScalarTy.isSimple())
- return false;
- switch (ScalarTy.getSimpleVT().SimpleTy) {
- case MVT::iPTR:
- return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- case MVT::f16:
- case MVT::bf16:
- case MVT::f32:
- return true;
- case MVT::i64:
- case MVT::f64:
- return Subtarget.hasVInstructionsI64();
- default:
- return false;
- }
-}
unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
return NumRepeatedDivisors;
@@ -20751,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getAllOnesConstant(DL, VT);
return DAG.getConstant(0, DL, VT);
}
+ case Intrinsic::riscv_vsseg2_mask:
+ case Intrinsic::riscv_vsseg3_mask:
+ case Intrinsic::riscv_vsseg4_mask:
+ case Intrinsic::riscv_vsseg5_mask:
+ case Intrinsic::riscv_vsseg6_mask:
+ case Intrinsic::riscv_vsseg7_mask:
+ case Intrinsic::riscv_vsseg8_mask: {
+ SDValue Tuple = N->getOperand(2);
+ unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+
+ if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() ||
+ Tuple.getOpcode() != RISCVISD::TUPLE_INSERT ||
+ !Tuple.getOperand(0).isUndef())
+ return SDValue();
+
+ SDValue Val = Tuple.getOperand(1);
+ unsigned Idx = Tuple.getConstantOperandVal(2);
+
+ unsigned SEW = Val.getValueType().getScalarSizeInBits();
+ assert(Log2_64(SEW) == N->getConstantOperandVal(6) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/N->getOperand(0),
+ /*IntID=*/
+ DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT),
+ /*StoredVal=*/Val,
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/N->getOperand(4),
+ /*VL=*/N->getOperand(5)};
+
+ auto *OldMemSD = cast<MemIntrinsicSDNode>(N);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = OldMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT,
+ MMO);
+ }
}
}
case ISD::EXPERIMENTAL_VP_REVERSE:
@@ -20843,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case RISCVISD::TUPLE_EXTRACT: {
+ EVT VT = N->getValueType(0);
+ SDValue Tuple = N->getOperand(0);
+ unsigned Idx = N->getConstantOperandVal(1);
+ if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ break;
+
+ unsigned NF = 0;
+ switch (Tuple.getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask:
+ NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+ break;
+ }
+
+ if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
+ break;
+
+ unsigned SEW = VT.getScalarSizeInBits();
+ assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/Tuple.getOperand(0),
+ /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
+ /*Passthru=*/Tuple.getOperand(2),
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/Tuple.getOperand(4),
+ /*VL=*/Tuple.getOperand(5),
+ /*Policy=*/Tuple.getOperand(6)};
+
+ auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList({VT, MVT::Other});
+ SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+ Ops, MemVT, MMO);
+ DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
+ return Result.getValue(0);
+ }
+ case RISCVISD::TUPLE_INSERT: {
+ // tuple_insert tuple, undef, idx -> tuple
+ if (N->getOperand(1).isUndef())
+ return N->getOperand(0);
+ break;
+ }
}
return SDValue();
@@ -22367,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
+ case CallingConv::PreserveMost:
case CallingConv::GRAAL:
case CallingConv::RISCV_VectorCall:
#define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN:
@@ -24260,7 +24349,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
+ if (!isLegalElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a788c0b7..ca70c46 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -384,7 +384,6 @@ public:
bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
bool isLegalElementTypeForRVV(EVT ScalarTy) const;
- bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index dd365cf..8297d50 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
+let IsSignExtendingOpW = 1 in
def CLS : Unary_r<0b011000000011, 0b001, "cls">;
def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
@@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in {
def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+let IsSignExtendingOpW = 1 in {
def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+}
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index f391300..5265613 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in {
def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">;
def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">;
- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
- def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs),
- (ins uimm5nonzero:$imm),
- "qc.c.delay", "$imm"> {
- let Inst{12} = 0;
- let Inst{11-7} = 0;
- let Inst{6-2} = imm{4-0};
- }
+ // qc.c.delay implemented as an alias, below
} // Predicates = [HasVendorXqcisync, IsRV32]
let Predicates = [HasVendorXqcisim, IsRV32] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
- def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10),
- "qc.psyscalli", "$imm10"> {
- bits<10> imm10;
-
- let rs1 = 0;
- let rd = 0;
- let imm12 = {0b00, imm10};
- }
-
def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8),
"qc.pputci", "$imm8"> {
bits<8> imm8;
@@ -1150,18 +1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
let imm12 = {0b0100, imm8};
}
- def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">;
- def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">;
- def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">;
- def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">;
- def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">;
- def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">;
- def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">;
-
- def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> {
- let rd = 0;
- let imm = 0;
- }
+ // The other instructions are all implemented as aliases, below
} // mayLoad = 0, mayStore = 0, hasSideEffects = 1
} // Predicates = [HasVendorXqcisim, IsRV32]
@@ -1218,6 +1191,27 @@ let EmitPriority = 0 in {
} // EmitPriority = 0
} // Predicates = [HasVendorXqcilo, IsRV32]
+let Predicates = [HasVendorXqcisim, IsRV32] in {
+let EmitPriority = 1 in {
+ def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>;
+
+ def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>;
+ def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>;
+ def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>;
+ def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>;
+ def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>;
+ def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>;
+ def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>;
+ def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>;
+} // EmitPriority = 1
+} // Predicates = [HasVendorXqcisim, IsRV32]
+
+let Predicates = [HasVendorXqcisync, IsRV32] in {
+let EmitPriority = 1 in {
+ def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>;
+}
+} // Predicates = [HasVendorXqcisync, IsRV32]
+
//===----------------------------------------------------------------------===//
// Pseudo-instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 3cbe668..726920e 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
+ if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- // If the segment load is going to be performed segment at a time anyways
- // and there's only one element used, use a strided load instead. This
- // will be equally fast, and create less vector register pressure.
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
- unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
- // the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
- {VTy, BasePtr->getType(), Stride->getType()},
- {BasePtr, Stride, Mask, VL});
- Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
- CI->addParamAttr(0,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- Shuffles[0]->replaceAllUsesWith(CI);
- return true;
- };
-
CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
@@ -289,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- unsigned Index;
- // If the segment store only has one active lane (i.e. the interleave is
- // just a spread shuffle), we can use a strided store instead. This will
- // be equally fast, and create less vector register pressure.
- if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
- isSpreadMask(Mask, Factor, Index)) {
- unsigned ScalarSizeInBytes =
- DL.getTypeStoreSize(ShuffleVTy->getElementType());
- Value *Data = SVI->getOperand(0);
- Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at
- // most the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
- {VTy, BasePtr->getType(), Stride->getType()},
- {Data, BasePtr, Stride, LaneMask, VL});
- Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
- CI->addParamAttr(1,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- return true;
- }
-
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 5404123..7e58b6f 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
return CSR_NoRegs_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return Subtarget.hasStdExtE() ? CSR_RT_MostRegs_RVE_SaveList
+ : CSR_RT_MostRegs_SaveList;
if (MF->getFunction().hasFnAttribute("interrupt")) {
if (Subtarget.hasVInstructions()) {
if (Subtarget.hasStdExtD())
@@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int64_t Val = Offset.getFixed();
int64_t Lo12 = SignExtend64<12>(Val);
unsigned Opc = MI.getOpcode();
+
if (Opc == RISCV::ADDI && !isInt<12>(Val)) {
// We chose to emit the canonical immediate sequence rather than folding
// the offset into the using add under the theory that doing so doesn't
@@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
(Lo12 & 0b11111) != 0) {
// Prefetch instructions require the offset to be 32 byte aligned.
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+ } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) {
+ // MIPS Prefetch instructions require the offset to be 9 bits encoded.
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
} else if ((Opc == RISCV::PseudoRV32ZdinxLD ||
Opc == RISCV::PseudoRV32ZdinxSD) &&
Lo12 >= 2044) {
@@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
- switch (Subtarget.getTargetABI()) {
+ RISCVABI::ABI ABI = Subtarget.getTargetABI();
+ if (CC == CallingConv::PreserveMost) {
+ if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E)
+ return CSR_RT_MostRegs_RVE_RegMask;
+ return CSR_RT_MostRegs_RegMask;
+ }
+ switch (ABI) {
default:
llvm_unreachable("Unrecognized ABI");
case RISCVABI::ABI_ILP32E:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fd634b5..61dbd06 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{
{Intrinsic::roundeven, MVT::f64, 9},
{Intrinsic::rint, MVT::f32, 7},
{Intrinsic::rint, MVT::f64, 7},
- {Intrinsic::lrint, MVT::i32, 1},
- {Intrinsic::lrint, MVT::i64, 1},
- {Intrinsic::llrint, MVT::i64, 1},
{Intrinsic::nearbyint, MVT::f32, 9},
{Intrinsic::nearbyint, MVT::f64, 9},
{Intrinsic::bswap, MVT::i16, 3},
@@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
switch (ICA.getID()) {
case Intrinsic::lrint:
case Intrinsic::llrint:
- // We can't currently lower half or bfloat vector lrint/llrint.
- if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
- VecTy && VecTy->getElementType()->is16bitFPTy())
- return InstructionCost::getInvalid();
- [[fallthrough]];
+ case Intrinsic::lround:
+ case Intrinsic::llround: {
+ auto LT = getTypeLegalizationCost(RetTy);
+ Type *SrcTy = ICA.getArgTypes().front();
+ auto SrcLT = getTypeLegalizationCost(SrcTy);
+ if (ST->hasVInstructions() && LT.second.isVector()) {
+ SmallVector<unsigned, 2> Ops;
+ unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
+ unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
+ if (LT.second.getVectorElementType() == MVT::bf16) {
+ if (!ST->hasVInstructionsBF16Minimal())
+ return InstructionCost::getInvalid();
+ if (DstEltSz == 32)
+ Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
+ else
+ Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
+ } else if (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16()) {
+ if (!ST->hasVInstructionsF16Minimal())
+ return InstructionCost::getInvalid();
+ if (DstEltSz == 32)
+ Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
+ else
+ Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
+
+ } else if (SrcEltSz > DstEltSz) {
+ Ops = {RISCV::VFNCVT_X_F_W};
+ } else if (SrcEltSz < DstEltSz) {
+ Ops = {RISCV::VFWCVT_X_F_V};
+ } else {
+ Ops = {RISCV::VFCVT_X_F_V};
+ }
+
+ // We need to use the source LMUL in the case of a narrowing op, and the
+ // destination LMUL otherwise.
+ if (SrcEltSz > DstEltSz)
+ return SrcLT.first *
+ getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
+ return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
+ }
+ break;
+ }
case Intrinsic::ceil:
case Intrinsic::floor:
case Intrinsic::trunc:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index f0510ec..d62d99c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -265,7 +265,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 5cda6a0..7505507 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass {
// Returns the loaded value.
Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType,
FixedVectorType *TargetType, Value *Source) {
- // We expect the codegen to avoid doing implicit bitcast from a load.
- assert(TargetType->getElementType() == SourceType->getElementType());
- assert(TargetType->getNumElements() < SourceType->getNumElements());
-
+ assert(TargetType->getNumElements() <= SourceType->getNumElements());
LoadInst *NewLoad = B.CreateLoad(SourceType, Source);
buildAssignType(B, SourceType, NewLoad);
+ Value *AssignValue = NewLoad;
+ if (TargetType->getElementType() != SourceType->getElementType()) {
+ AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast,
+ {TargetType, SourceType}, {NewLoad});
+ buildAssignType(B, TargetType, AssignValue);
+ }
SmallVector<int> Mask(/* Size= */ TargetType->getNumElements());
for (unsigned I = 0; I < TargetType->getNumElements(); ++I)
Mask[I] = I;
- Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask);
+ Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask);
buildAssignType(B, TargetType, Output);
return Output;
}
@@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass {
Output = loadFirstValueFromAggregate(B, SVT->getElementType(),
OriginalOperand, LI);
}
- // Destination is a smaller vector than source.
+ // Destination is a smaller vector than source or different vector type.
// - float3 v3 = vector4;
+ // - float4 v2 = int4;
else if (SVT && DVT)
Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand);
// Destination is the scalar type stored at the start of an aggregate.
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 721f64a..1995e0f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal();
}
+ getActionDefinitionsBuilder(G_IS_FPCLASS).custom();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType,
bool SPIRVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
- auto Opc = MI.getOpcode();
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- if (Opc == TargetOpcode::G_ICMP) {
+ switch (MI.getOpcode()) {
+ default:
+ // TODO: implement legalization for other opcodes.
+ return true;
+ case TargetOpcode::G_IS_FPCLASS:
+ return legalizeIsFPClass(Helper, MI, LocObserver);
+ case TargetOpcode::G_ICMP: {
assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg()));
auto &Op0 = MI.getOperand(2);
auto &Op1 = MI.getOperand(3);
@@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom(
}
return true;
}
- // TODO: implement legalization for other opcodes.
+ }
+}
+
+// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted
+// to ensure that all instructions created during the lowering have SPIR-V types
+// assigned to them.
+bool SPIRVLegalizerInfo::legalizeIsFPClass(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
+
+ auto &MIRBuilder = Helper.MIRBuilder;
+ auto &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ Type *LLVMDstTy =
+ IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits());
+ if (DstTy.isVector())
+ LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount());
+ SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType(
+ LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+
+ unsigned BitSize = SrcTy.getScalarSizeInBits();
+ const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
+
+ LLT IntTy = LLT::scalar(BitSize);
+ Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize);
+ if (SrcTy.isVector()) {
+ IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
+ LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount());
+ }
+ SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType(
+ LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+
+ // Clang doesn't support capture of structured bindings:
+ LLT DstTyCopy = DstTy;
+ const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) {
+ // Assign this MI's (assumed only) destination to one of the two types we
+ // expect: either the G_IS_FPCLASS's destination type, or the integer type
+ // bitcast from the source type.
+ LLT MITy = MRI.getType(MI.getReg(0));
+ assert((MITy == IntTy || MITy == DstTyCopy) &&
+ "Unexpected LLT type while lowering G_IS_FPCLASS");
+ auto *SPVTy = MITy == IntTy ? SPIRVIntTy : SPIRVDstTy;
+ GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF);
+ return MI;
+ };
+
+ // Helper to build and assign a constant in one go
+ const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder {
+ if (!Ty.isFixedVector())
+ return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C));
+ auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C);
+ assert((Ty == IntTy || Ty == DstTyCopy) &&
+ "Unexpected LLT type while lowering constant for G_IS_FPCLASS");
+ SPIRVType *VecEltTy = GR->getOrCreateSPIRVType(
+ (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+ GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF);
+ return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC));
+ };
+
+ if (Mask == fcNone) {
+ MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0));
+ MI.eraseFromParent();
+ return true;
+ }
+ if (Mask == fcAllFlags) {
+ MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1));
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Note that rather than creating a COPY here (between a floating-point and
+ // integer type of the same size) we create a SPIR-V bitcast immediately. We
+ // can't create a G_BITCAST because the LLTs are the same, and we can't seem
+ // to correctly lower COPYs to SPIR-V bitcasts at this moment.
+ Register ResVReg = MRI.createGenericVirtualRegister(IntTy);
+ MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy));
+ GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF());
+ auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast)
+ .addDef(ResVReg)
+ .addUse(GR->getSPIRVTypeID(SPIRVIntTy))
+ .addUse(SrcReg);
+ AsInt = assignSPIRVTy(std::move(AsInt));
+
+ // Various masks.
+ APInt SignBit = APInt::getSignMask(BitSize);
+ APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign.
+ APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
+ APInt ExpMask = Inf;
+ APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
+ APInt QNaNBitMask =
+ APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
+ APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
+
+ auto SignBitC = buildSPIRVConstant(IntTy, SignBit);
+ auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask);
+ auto InfC = buildSPIRVConstant(IntTy, Inf);
+ auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask);
+ auto ZeroC = buildSPIRVConstant(IntTy, 0);
+
+ auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC));
+ auto Sign = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs));
+
+ auto Res = buildSPIRVConstant(DstTy, 0);
+
+ const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) {
+ Res = assignSPIRVTy(
+ MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend))));
+ };
+
+ // Tests that involve more than one class should be processed first.
+ if ((Mask & fcFinite) == fcFinite) {
+ // finite(V) ==> abs(V) u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
+ ExpMaskC));
+ Mask &= ~fcFinite;
+ } else if ((Mask & fcFinite) == fcPosFinite) {
+ // finite(V) && V > 0 ==> V u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
+ ExpMaskC));
+ Mask &= ~fcPosFinite;
+ } else if ((Mask & fcFinite) == fcNegFinite) {
+ // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
+ auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT,
+ DstTy, Abs, ExpMaskC));
+ appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign));
+ Mask &= ~fcNegFinite;
+ }
+
+ if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
+ // fcZero | fcSubnormal => test all exponent bits are 0
+ // TODO: Handle sign bit specific cases
+ // TODO: Handle inverted case
+ if (PartialCheck == (fcZero | fcSubnormal)) {
+ auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC));
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ ExpBits, ZeroC));
+ Mask &= ~PartialCheck;
+ }
+ }
+
+ // Check for individual classes.
+ if (FPClassTest PartialCheck = Mask & fcZero) {
+ if (PartialCheck == fcPosZero)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, ZeroC));
+ else if (PartialCheck == fcZero)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
+ else // fcNegZero
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, SignBitC));
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcSubnormal) {
+ // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
+ // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
+ auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
+ auto OneC = buildSPIRVConstant(IntTy, 1);
+ auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
+ auto SubnormalRes = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
+ buildSPIRVConstant(IntTy, AllOneMantissa)));
+ if (PartialCheck == fcNegSubnormal)
+ SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
+ appendToRes(std::move(SubnormalRes));
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcInf) {
+ if (PartialCheck == fcPosInf)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, InfC));
+ else if (PartialCheck == fcInf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
+ else { // fcNegInf
+ APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
+ auto NegInfC = buildSPIRVConstant(IntTy, NegInf);
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, NegInfC));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNan) {
+ auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+ if (PartialCheck == fcNan) {
+ // isnan(V) ==> abs(V) u> int(inf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
+ } else if (PartialCheck == fcQNan) {
+ // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
+ InfWithQnanBitC));
+ } else { // fcSNan
+ // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
+ // abs(V) u< (unsigned(Inf) | quiet_bit)
+ auto IsNan = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
+ auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp(
+ CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC));
+ appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNormal) {
+ // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
+ // (max_exp-1))
+ APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
+ auto ExpMinusOne = assignSPIRVTy(
+ MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
+ APInt MaxExpMinusOne = ExpMask - ExpLSB;
+ auto NormalRes = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
+ buildSPIRVConstant(IntTy, MaxExpMinusOne)));
+ if (PartialCheck == fcNegNormal)
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
+ else if (PartialCheck == fcPosNormal) {
+ auto PosSign = assignSPIRVTy(MIRBuilder.buildXor(
+ DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask)));
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
+ }
+ appendToRes(std::move(NormalRes));
+ }
+
+ MIRBuilder.buildCopy(DstReg, Res);
+ MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
index 6335f21..eeefa42 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
@@ -30,6 +30,10 @@ public:
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const override;
SPIRVLegalizerInfo(const SPIRVSubtarget &ST);
+
+private:
+ bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index cd434f7..3f80b2a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+ // We don't natively support v16i8 mul, but we do support v8i16 so split the
+ // inputs and extend them to v8i16. Only do this before legalization in case
+ // a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 28f6599..c3990d1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
for (Instruction &I : BB) {
if (I.getType()->isVoidTy())
continue;
+
+ if (isa<AllocaInst>(&I)) {
+ // If the alloca has any lifetime marker that is no longer dominated
+ // by the alloca, remove all lifetime markers. Lifetime markers must
+ // always work directly on the alloca, and this is no longer possible.
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) {
+ auto *UserI = cast<Instruction>(U);
+ return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI);
+ });
+ if (HasNonDominatedLifetimeMarker) {
+ for (User *U : make_early_inc_range(I.users())) {
+ auto *UserI = cast<Instruction>(U);
+ if (UserI->isLifetimeStartOrEnd())
+ UserI->eraseFromParent();
+ }
+ }
+ }
+
unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
// If a value is defined by an invoke instruction, it is only available in
// its normal destination and not in its unwind destination.
@@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Setjmp preparation
+ SmallVector<AllocaInst *> StaticAllocas;
+ for (Instruction &I : F.getEntryBlock())
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ if (AI->isStaticAlloca())
+ StaticAllocas.push_back(AI);
+
BasicBlock *Entry = &F.getEntryBlock();
DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
SplitBlock(Entry, &*Entry->getFirstInsertionPt());
+ // Move static allocas back into the entry block, so they stay static.
+ for (AllocaInst *AI : StaticAllocas)
+ AI->moveBefore(Entry->getTerminator()->getIterator());
+
IRB.SetInsertPoint(Entry->getTerminator()->getIterator());
// This alloca'ed pointer is used by the runtime to identify function
// invocations. It's just for pointer comparisons. It will never be
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 1bf9f8b..f9bd233 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources}
IRPrinter
Instrumentation
MC
+ ObjCARC
ProfileData
Scalar
SelectionDAG
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 11ab8dc..7244a6d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58074,11 +58074,9 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
// res, flags2 = sub 0, (and X, Y)
// cload/cstore ..., cond_ne, flag2
// ->
- // res, flags2 = and X, Y
+ // res, flags2 = cmp (and X, Y), 0
// cload/cstore ..., cond_ne, flag2
- Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0),
- Op1.getOperand(1))
- .getValue(1);
+ Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Op1, Sub.getOperand(0));
} else {
return SDValue();
}
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 64b33e4..ab906f9 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1568,7 +1568,7 @@ private:
if (DebugLoc SuspendLoc = S->getDebugLoc()) {
std::string LabelName =
("__coro_resume_" + Twine(SuspendIndex)).str();
- DILocation &DILoc = *SuspendLoc.get();
+ DILocation &DILoc = *SuspendLoc;
DILabel *ResumeLabel =
DBuilder.createLabel(DIS, LabelName, DILoc.getFile(),
SuspendLoc.getLine(), SuspendLoc.getCol(),
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index f43202e..8262c8c 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes,
struct SCCNodesResult {
SCCNodeSet SCCNodes;
- bool HasUnknownCall;
};
} // end anonymous namespace
@@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes,
static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
SCCNodesResult Res;
- Res.HasUnknownCall = false;
for (Function *F : Functions) {
if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) ||
F->isPresplitCoroutine()) {
- // Treat any function we're trying not to optimize as if it were an
- // indirect call and omit it from the node set used below.
- Res.HasUnknownCall = true;
+ // Omit any functions we're trying not to optimize from the set.
continue;
}
- // Track whether any functions in this SCC have an unknown call edge.
- // Note: if this is ever a performance hit, we can common it with
- // subsequent routines which also do scans over the instructions of the
- // function.
- if (!Res.HasUnknownCall) {
- for (Instruction &I : instructions(*F)) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!CB->getCalledFunction()) {
- Res.HasUnknownCall = true;
- break;
- }
- }
- }
- }
+
Res.SCCNodes.insert(F);
}
return Res;
@@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
addColdAttrs(Nodes.SCCNodes, Changed);
addWillReturn(Nodes.SCCNodes, Changed);
addNoUndefAttrs(Nodes.SCCNodes, Changed);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
- if (!Nodes.HasUnknownCall) {
- addNoAliasAttrs(Nodes.SCCNodes, Changed);
- addNonNullAttrs(Nodes.SCCNodes, Changed);
- inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
- addNoRecurseAttrs(Nodes.SCCNodes, Changed);
- }
+ addNoAliasAttrs(Nodes.SCCNodes, Changed);
+ addNonNullAttrs(Nodes.SCCNodes, Changed);
+ inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
+ addNoRecurseAttrs(Nodes.SCCNodes, Changed);
// Finally, infer the maximal set of attributes from the ones we've inferred
// above. This is handling the cases where one attribute on a signature
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index f00796f..c009c1e 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
+STATISTIC(MismatchedCloneAssignments,
+ "Number of callsites assigned to call multiple non-matching clones");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -730,7 +732,7 @@ private:
/// of the functions tracked calls to their new versions in the CallMap.
/// Assigns new clones to clone number CloneNo.
FuncInfo cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
@@ -897,7 +899,7 @@ private:
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
@@ -989,7 +991,7 @@ private:
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
@@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
+// Return the clone number of the given function by extracting it from the
+// memprof suffix. Assumes the caller has already confirmed it is a memprof
+// clone.
+static unsigned getMemProfCloneNum(const Function &F) {
+ assert(isMemProfClone(F));
+ auto Pos = F.getName().find_last_of('.');
+ assert(Pos > 0);
+ unsigned CloneNo;
+ bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
+ assert(!Err);
+ (void)Err;
+ return CloneNo;
+}
+
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- if (CalleeFunc.cloneNo() > 0)
+ auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ if (isMemProfClone(*CurF)) {
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
+ if (CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ }
+ if (NewCalleeCloneNo > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
@@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
- CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ CurCalleeCloneNo = NewCalleeCloneNo;
}
// Update the debug information attached to NewFunc to use the clone Name. Note
@@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Use existing LLVM facilities for cloning and obtaining Call in clone
ValueToValueMapTy VMap;
@@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite(
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
IndexCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Check how many clones we have of Call (and therefore function).
// The next clone number is the current size of versions array.
@@ -4463,7 +4506,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncInfo FuncClone;
// Remappings of each call of interest (from original uncloned call to the
// corresponding cloned call in this function clone).
- std::map<CallInfo, CallInfo> CallMap;
+ DenseMap<CallInfo, CallInfo> CallMap;
};
// Walk all functions for which we saw calls with memprof metadata, and handle
@@ -4499,7 +4542,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
assert(FuncCloneInfos.size() > FuncClone.cloneNo());
- std::map<CallInfo, CallInfo> &CallMap =
+ DenseMap<CallInfo, CallInfo> &CallMap =
FuncCloneInfos[FuncClone.cloneNo()].CallMap;
CallInfo CallClone(Call);
if (auto It = CallMap.find(Call); It != CallMap.end())
@@ -4551,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}));
// Initialize with empty call map, assign Clone to original function
// and its callers, and skip to the next clone.
- FuncCloneInfos.push_back({OrigFunc, {}});
+ FuncCloneInfos.push_back(
+ {OrigFunc, DenseMap<CallInfo, CallInfo>()});
AssignCallsiteCloneToFuncClone(
OrigFunc, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
@@ -4584,7 +4628,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// Clone function and save it along with the CallInfo map created
// during cloning in the FuncCloneInfos.
- std::map<CallInfo, CallInfo> NewCallMap;
+ DenseMap<CallInfo, CallInfo> NewCallMap;
unsigned CloneNo = FuncCloneInfos.size();
assert(CloneNo > 0 && "Clone 0 is the original function, which "
"should already exist in the map");
@@ -4691,7 +4735,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// CallMap is set up as indexed by original Call at clone 0.
CallInfo OrigCall(Callee->getOrigNode()->Call);
OrigCall.setCloneNo(0);
- std::map<CallInfo, CallInfo> &CallMap =
+ DenseMap<CallInfo, CallInfo> &CallMap =
FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
assert(CallMap.count(OrigCall));
CallInfo NewCall(CallMap[OrigCall]);
@@ -4714,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}
+ auto FindFirstAvailFuncClone = [&]() {
+ // Find first function in FuncCloneInfos without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncCloneInfos size was smaller than the clone number.
+ for (auto &CF : FuncCloneInfos) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
+ return CF.FuncClone;
+ }
+ llvm_unreachable(
+ "Expected an available func clone for this callsite clone");
+ };
+
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
@@ -4830,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
- // Find first function in FuncCloneInfos without an assigned
- // clone of this callsite Node. We should always have one
- // available at this point due to the earlier cloning when the
- // FuncCloneInfos size was smaller than the clone number.
- for (auto &CF : FuncCloneInfos) {
- if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) {
- FuncCloneAssignedToCurCallsiteClone = CF.FuncClone;
- break;
- }
- }
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
@@ -4853,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
+ // If we didn't assign a function clone to this callsite clone yet, e.g.
+ // none of its callers has a non-null call, do the assignment here.
+ // We want to ensure that every callsite clone is assigned to some
+ // function clone, so that the call updates below work as expected.
+ // In particular if this is the original callsite, we want to ensure it
+ // is assigned to the original function, otherwise the original function
+ // will appear available for assignment to other callsite clones,
+ // leading to unintended effects. For one, the unknown and not updated
+ // callers will call into cloned paths leading to the wrong hints,
+ // because they still call the original function (clone 0). Also,
+ // because all callsites start out as being clone 0 by default, we can't
+ // easily distinguish between callsites explicitly assigned to clone 0
+ // vs those never assigned, which can lead to multiple updates of the
+ // calls when invoking updateCall below, with mismatched clone values.
+ // TODO: Add a flag to the callsite nodes or some other mechanism to
+ // better distinguish and identify callsite clones that are not getting
+ // assigned to function clones as expected.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
+ assert(FuncCloneAssignedToCurCallsiteClone &&
+ "No available func clone for this callsite clone");
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+ /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
+ }
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b231c04..d7971e8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -11,10 +11,13 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/FloatingPointPredicateUtils.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -3589,6 +3592,154 @@ static Value *foldOrOfInversions(BinaryOperator &I,
return nullptr;
}
+/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl"
+/// patterns, which extract vector elements and pack them in the same relative
+/// positions.
+///
+/// \p Vec is the underlying vector being extracted from.
+/// \p Mask is a bitmask identifying which packed elements are obtained from the
+/// vector.
+/// \p VecOffset is the vector element corresponding to index 0 of the
+/// mask.
+static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec,
+ int64_t &VecOffset,
+ SmallBitVector &Mask,
+ const DataLayout &DL) {
+ static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) {
+ ShlAmt = 0;
+ return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base);
+ };
+
+ // First try to match extractelement -> zext -> shl
+ uint64_t VecIdx, ShlAmt;
+ if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt(
+ m_Value(Vec), m_ConstantInt(VecIdx))),
+ ShlAmt))) {
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+ auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType());
+ if (!EltTy)
+ return false;
+
+ const unsigned EltBitWidth = EltTy->getBitWidth();
+ const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth();
+ if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0)
+ return false;
+ const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth;
+ const unsigned ShlEltAmt = ShlAmt / EltBitWidth;
+
+ const unsigned MaskIdx =
+ DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1;
+
+ VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx);
+ Mask.resize(TargetEltWidth);
+ Mask.set(MaskIdx);
+ return true;
+ }
+
+ // Now try to match a bitcasted subvector.
+ Instruction *SrcVecI;
+ if (!match(V, m_BitCast(m_Instruction(SrcVecI))))
+ return false;
+
+ auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType());
+ if (!SrcTy)
+ return false;
+
+ Mask.resize(SrcTy->getNumElements());
+
+ // First check for a subvector obtained from a shufflevector.
+ if (isa<ShuffleVectorInst>(SrcVecI)) {
+ Constant *ConstVec;
+ ArrayRef<int> ShuffleMask;
+ if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec),
+ m_Mask(ShuffleMask))))
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+
+ const unsigned NumVecElts = VecTy->getNumElements();
+ bool FoundVecOffset = false;
+ for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) {
+ if (ShuffleMask[Idx] == PoisonMaskElem)
+ return false;
+ const unsigned ShuffleIdx = ShuffleMask[Idx];
+ if (ShuffleIdx >= NumVecElts) {
+ const unsigned ConstIdx = ShuffleIdx - NumVecElts;
+ auto *ConstElt =
+ dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx));
+ if (!ConstElt || !ConstElt->isNullValue())
+ return false;
+ continue;
+ }
+
+ if (FoundVecOffset) {
+ if (VecOffset + Idx != ShuffleIdx)
+ return false;
+ } else {
+ if (ShuffleIdx < Idx)
+ return false;
+ VecOffset = ShuffleIdx - Idx;
+ FoundVecOffset = true;
+ }
+ Mask.set(Idx);
+ }
+ return FoundVecOffset;
+ }
+
+ // Check for a subvector obtained as an (insertelement V, 0, idx)
+ uint64_t InsertIdx;
+ if (!match(SrcVecI,
+ m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx))))
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+ VecOffset = 0;
+ bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx);
+ Mask.set();
+ if (!AlreadyInsertedMaskedElt)
+ Mask.reset(InsertIdx);
+ return true;
+}
+
+/// Try to fold the join of two scalar integers whose contents are packed
+/// elements of the same vector.
+static Instruction *foldIntegerPackFromVector(Instruction &I,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *LhsVec, *RhsVec;
+ int64_t LhsVecOffset, RhsVecOffset;
+ SmallBitVector Mask;
+ if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset,
+ Mask, DL))
+ return nullptr;
+ if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset,
+ Mask, DL))
+ return nullptr;
+ if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset)
+ return nullptr;
+
+ // Convert into shufflevector -> bitcast;
+ const unsigned ZeroVecIdx =
+ cast<FixedVectorType>(LhsVec->getType())->getNumElements();
+ SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx);
+ for (unsigned Idx : Mask.set_bits()) {
+ assert(LhsVecOffset + Idx >= 0);
+ ShuffleMask[Idx] = LhsVecOffset + Idx;
+ }
+
+ Value *MaskedVec = Builder.CreateShuffleVector(
+ LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask,
+ I.getName() + ".v");
+ return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType());
+}
+
// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
// track these properities for preservation. Note that we can decompose
// equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
@@ -3766,6 +3917,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *X = foldComplexAndOrPatterns(I, Builder))
return X;
+ if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL))
+ return X;
+
// (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D
// (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C
if (Value *V = foldOrOfInversions(I, Builder))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index da9b126..b268fea 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -163,6 +163,11 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
LaterIndices.push_back(IdxVal);
}
+ Value *Idx = GEP->getOperand(2);
+ // If the index type is non-canonical, wait for it to be canonicalized.
+ if (Idx->getType() != DL.getIndexType(GEP->getType()))
+ return nullptr;
+
enum { Overdefined = -3, Undefined = -2 };
// Variables for our state machines.
@@ -290,17 +295,6 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Now that we've scanned the entire array, emit our new comparison(s). We
// order the state machines in complexity of the generated code.
- Value *Idx = GEP->getOperand(2);
-
- // If the index is larger than the pointer offset size of the target, truncate
- // the index down like the GEP would do implicitly. We don't have to do this
- // for an inbounds GEP because the index can't be out of range.
- if (!GEP->isInBounds()) {
- Type *PtrIdxTy = DL.getIndexType(GEP->getType());
- unsigned OffsetSize = PtrIdxTy->getIntegerBitWidth();
- if (Idx->getType()->getPrimitiveSizeInBits().getFixedValue() > OffsetSize)
- Idx = Builder.CreateTrunc(Idx, PtrIdxTy);
- }
// If inbounds keyword is not present, Idx * ElementSize can overflow.
// Let's assume that ElementSize is 2 and the wanted value is at offset 0.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e2a9255..9e33320 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2777,6 +2777,12 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
Indices.append(GEP.idx_begin()+1, GEP.idx_end());
}
+ // Don't create GEPs with more than one variable index.
+ unsigned NumVarIndices =
+ count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); });
+ if (NumVarIndices > 1)
+ return nullptr;
+
if (!Indices.empty())
return replaceInstUsesWith(
GEP, Builder.CreateGEP(
@@ -3199,6 +3205,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, NewGEP);
}
+ // Strip trailing zero indices.
+ auto *LastIdx = dyn_cast<Constant>(Indices.back());
+ if (LastIdx && LastIdx->isNullValue() && !LastIdx->getType()->isVectorTy()) {
+ return replaceInstUsesWith(
+ GEP, Builder.CreateGEP(GEP.getSourceElementType(), PtrOp,
+ drop_end(Indices), "", GEP.getNoWrapFlags()));
+ }
+
// Scalarize vector operands; prefer splat-of-gep.as canonical form.
// Note that this looses information about undef lanes; we run it after
// demanded bits to partially mitigate that loss.
@@ -3225,6 +3239,30 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, Res);
}
+ bool SeenVarIndex = false;
+ for (auto [IdxNum, Idx] : enumerate(Indices)) {
+ if (isa<Constant>(Idx))
+ continue;
+
+ if (!SeenVarIndex) {
+ SeenVarIndex = true;
+ continue;
+ }
+
+ // GEP has multiple variable indices: Split it.
+ ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
+ Value *FrontGEP =
+ Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
+ GEP.getName() + ".split", GEP.getNoWrapFlags());
+
+ SmallVector<Value *> BackIndices;
+ BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy));
+ append_range(BackIndices, drop_begin(Indices, IdxNum));
+ return GetElementPtrInst::Create(
+ GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP,
+ BackIndices, GEP.getNoWrapFlags());
+ }
+
// Check to see if the inputs to the PHI node are getelementptr instructions.
if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder))
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index df31f07..54d9a83 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Approximately handle AVX Galois Field Affine Transformation
+ //
+ // e.g.,
+ // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+ // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+ // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+ // Out A x b
+ // where A and x are packed matrices, b is a vector,
+ // Out = A * x + b in GF(2)
+ //
+ // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
+ // computation also includes a parity calculation.
+ //
+ // For the bitwise AND of bits V1 and V2, the exact shadow is:
+ // Out_Shadow = (V1_Shadow & V2_Shadow)
+ // | (V1 & V2_Shadow)
+ // | (V1_Shadow & V2 )
+ //
+ // We approximate the shadow of gf2p8affineqb using:
+ // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
+ // | gf2p8affineqb(x, A_shadow, 0)
+ // | gf2p8affineqb(x_Shadow, A, 0)
+ // | set1_epi8(b_Shadow)
+ //
+ // This approximation has false negatives: if an intermediate dot-product
+ // contains an even number of 1's, the parity is 0.
+ // It has no false positives.
+ void handleAVXGF2P8Affine(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+
+ assert(I.arg_size() == 3);
+ Value *A = I.getOperand(0);
+ Value *X = I.getOperand(1);
+ Value *B = I.getOperand(2);
+
+ assert(isFixedIntVector(A));
+ assert(cast<VectorType>(A->getType())
+ ->getElementType()
+ ->getScalarSizeInBits() == 8);
+
+ assert(A->getType() == X->getType());
+
+ assert(B->getType()->isIntegerTy());
+ assert(B->getType()->getScalarSizeInBits() == 8);
+
+ assert(I.getType() == A->getType());
+
+ Value *AShadow = getShadow(A);
+ Value *XShadow = getShadow(X);
+ Value *BZeroShadow = getCleanShadow(B);
+
+ CallInst *AShadowXShadow = IRB.CreateIntrinsic(
+ I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow});
+ CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {X, AShadow, BZeroShadow});
+ CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {XShadow, A, BZeroShadow});
+
+ unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
+ Value *BShadow = getShadow(B);
+ Value *BBroadcastShadow = getCleanShadow(AShadow);
+ // There is no LLVM IR intrinsic for _mm512_set1_epi8.
+ // This loop generates a lot of LLVM IR, which we expect that CodeGen will
+ // lower appropriately (e.g., VPBROADCASTB).
+ // Besides, b is often a constant, in which case it is fully initialized.
+ for (unsigned i = 0; i < NumElements; i++)
+ BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i);
+
+ setShadow(&I, IRB.CreateOr(
+ {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow}));
+ setOriginForNaryOp(I);
+ }
+
// Handle Arm NEON vector load intrinsics (vld*).
//
// The WithLane instructions (ld[234]lane) are similar to:
@@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // AVX Galois Field New Instructions
+ case Intrinsic::x86_vgf2p8affineqb_128:
+ case Intrinsic::x86_vgf2p8affineqb_256:
+ case Intrinsic::x86_vgf2p8affineqb_512:
+ handleAVXGF2P8Affine(I);
+ break;
+
case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 0f63ed0..9b87180 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1360,13 +1360,10 @@ struct DSEState {
/// indicating whether \p I is a free-like call.
std::optional<std::pair<MemoryLocation, bool>>
getLocForTerminator(Instruction *I) const {
- uint64_t Len;
- Value *Ptr;
- if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
- m_Value(Ptr))))
- return {std::make_pair(MemoryLocation(Ptr, Len), false)};
-
if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->getIntrinsicID() == Intrinsic::lifetime_end)
+ return {
+ std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)};
if (Value *FreedOp = getFreedOperand(CB, &TLI))
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 6a3f656..1a52af1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -651,7 +651,7 @@ class NewGVN {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred;
+ mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -840,7 +840,7 @@ private:
// Ranking
unsigned int getRank(const Value *) const;
bool shouldSwapOperands(const Value *, const Value *) const;
- bool shouldSwapOperandsForIntrinsic(const Value *, const Value *,
+ bool shouldSwapOperandsForPredicate(const Value *, const Value *,
const IntrinsicInst *I) const;
// Reachability handling.
@@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
Value *AdditionallyUsedValue = CmpOp0;
// Sort the ops.
- if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) {
+ if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) {
std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
@@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() {
PredicateToUsers.clear();
MemoryToUsers.clear();
RevisitOnReachabilityChange.clear();
- IntrinsicInstPred.clear();
+ PredicateSwapChoice.clear();
}
// Assign local DFS number mapping to instructions, and leave space for Value
@@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
}
-bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B,
+bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B,
const IntrinsicInst *I) const {
- auto LookupResult = IntrinsicInstPred.find(I);
if (shouldSwapOperands(A, B)) {
- if (LookupResult == IntrinsicInstPred.end())
- IntrinsicInstPred.insert({I, B});
- else
- LookupResult->second = B;
+ PredicateSwapChoice[I] = B;
return true;
}
- if (LookupResult != IntrinsicInstPred.end()) {
+ auto LookupResult = PredicateSwapChoice.find(I);
+ if (LookupResult != PredicateSwapChoice.end()) {
auto *SeenPredicate = LookupResult->second;
if (SeenPredicate) {
+ // We previously decided to swap B to the left. Keep that choice.
if (SeenPredicate == B)
return true;
else
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b9292af..b78c702 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -703,6 +703,7 @@ private:
// Add U as additional user of V.
void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
+ void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI);
void handleCallOverdefined(CallBase &CB);
void handleCallResult(CallBase &CB);
void handleCallArguments(CallBase &CB);
@@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
}
}
+void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf,
+ const PredicateBase *PI) {
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ const std::optional<PredicateConstraint> &Constraint = PI->getConstraint();
+ if (!Constraint) {
+ mergeInValue(ValueState[I], I, CopyOfVal);
+ return;
+ }
+
+ CmpInst::Predicate Pred = Constraint->Predicate;
+ Value *OtherOp = Constraint->OtherOp;
+
+ // Wait until OtherOp is resolved.
+ if (getValueState(OtherOp).isUnknown()) {
+ addAdditionalUser(OtherOp, I);
+ return;
+ }
+
+ ValueLatticeElement CondVal = getValueState(OtherOp);
+ ValueLatticeElement &IV = ValueState[I];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
+ /*UndefAllowed=*/true);
+ // Treat an unresolved input like a full range.
+ if (CopyOfCR.isEmptySet())
+ CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
+ // The new range is based on a branch condition. That guarantees that
+ // neither of the compare operands can be undef in the branch targets,
+ // unless we have conditions that are always true/false (e.g. icmp ule
+ // i32, %a, i32_max). For the latter overdefined/empty range will be
+ // inferred, but the branch will get folded accordingly anyways.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(
+ IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ &&
+ (CondVal.isConstant() || CondVal.isNotConstant())) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants or not-constants.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, CondVal);
+ return;
+ } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
+ // Propagate inequalities.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant()));
+ return;
+ }
+
+ return (void)mergeInValue(IV, I, CopyOfVal);
+}
+
void SCCPInstVisitor::handleCallResult(CallBase &CB) {
Function *F = CB.getCalledFunction();
@@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
return;
Value *CopyOf = CB.getOperand(0);
- ValueLatticeElement CopyOfVal = getValueState(CopyOf);
- const auto *PI = getPredicateInfoFor(&CB);
+ const PredicateBase *PI = getPredicateInfoFor(&CB);
assert(PI && "Missing predicate info for ssa.copy");
-
- const std::optional<PredicateConstraint> &Constraint =
- PI->getConstraint();
- if (!Constraint) {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
- CmpInst::Predicate Pred = Constraint->Predicate;
- Value *OtherOp = Constraint->OtherOp;
-
- // Wait until OtherOp is resolved.
- if (getValueState(OtherOp).isUnknown()) {
- addAdditionalUser(OtherOp, &CB);
- return;
- }
-
- ValueLatticeElement CondVal = getValueState(OtherOp);
- ValueLatticeElement &IV = ValueState[&CB];
- if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
- auto ImposedCR =
- ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
-
- // Get the range imposed by the condition.
- if (CondVal.isConstantRange())
- ImposedCR = ConstantRange::makeAllowedICmpRegion(
- Pred, CondVal.getConstantRange());
-
- // Combine range info for the original value with the new range from the
- // condition.
- auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
- /*UndefAllowed=*/true);
- // Treat an unresolved input like a full range.
- if (CopyOfCR.isEmptySet())
- CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
- auto NewCR = ImposedCR.intersectWith(CopyOfCR);
- // If the existing information is != x, do not use the information from
- // a chained predicate, as the != x information is more likely to be
- // helpful in practice.
- if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
- NewCR = CopyOfCR;
-
- // The new range is based on a branch condition. That guarantees that
- // neither of the compare operands can be undef in the branch targets,
- // unless we have conditions that are always true/false (e.g. icmp ule
- // i32, %a, i32_max). For the latter overdefined/empty range will be
- // inferred, but the branch will get folded accordingly anyways.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(
- IV, &CB,
- ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
- return;
- } else if (Pred == CmpInst::ICMP_EQ &&
- (CondVal.isConstant() || CondVal.isNotConstant())) {
- // For non-integer values or integer constant expressions, only
- // propagate equal constants or not-constants.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB, CondVal);
- return;
- } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
- // Propagate inequalities.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB,
- ValueLatticeElement::getNot(CondVal.getConstant()));
- return;
- }
-
- return (void)mergeInValue(IV, &CB, CopyOfVal);
+ handlePredicate(&CB, CopyOf, PI);
+ return;
}
if (II->getIntrinsicID() == Intrinsic::vscale) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7b7efb8..fe93fcd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -93,6 +93,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -155,6 +156,7 @@
#include <utility>
using namespace llvm;
+using namespace SCEVPatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
const Loop *L) {
- return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
+ if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
+ return ElementCount::getFixed(ExpectedTC);
+
+ const SCEV *BTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BTC))
+ return ElementCount::getFixed(0);
+
+ const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
+ if (isa<SCEVVScale>(ExitCount))
+ return ElementCount::getScalable(1);
+
+ const APInt *Scale;
+ if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
+ if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
+ if (Scale->getActiveBits() <= 32)
+ return ElementCount::getScalable(Scale->getZExtValue());
+
+ return ElementCount::getFixed(0);
}
/// Returns "best known" trip count, which is either a valid positive trip count
@@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) {
}
}
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// This function attempts to return a value that represents the ElementCount
+/// at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
- std::optional<unsigned> VScale) {
+static unsigned estimateElementCount(ElementCount VF,
+ std::optional<unsigned> VScale) {
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable())
if (VScale)
@@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// use the value of vscale used for tuning.
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
unsigned EstimatedVFxUF =
- getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+ estimateElementCount(VF * UF, Cost->getVScaleForTuning());
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}
@@ -3003,7 +3022,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// is correct. The easiest form of the later is to require that all values
// stored are the same.
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
- Legal->isInvariant(cast<StoreInst>(I)->getValueOperand()));
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
- getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
+ estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
@@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
+ return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
MinVFThreshold;
}
@@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
- getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
+ estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
@@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
-
// Try to get the exact trip count, or an estimate based on profiling data or
// ConstantMax from PSE, failing that.
- if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
+ auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
+
+ // For fixed length VFs treat a scalable trip count as unknown.
+ if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
+ // Re-evaluate trip counts and VFs to be in the same numerical space.
+ unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
+ unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
+
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
- unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
- ? BestKnownTC->getFixedValue() - 1
- : BestKnownTC->getFixedValue();
+ if (requiresScalarEpilogue(VF.isVector()))
+ --AvailableTC;
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
@@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
- unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
+ unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
@@ -7292,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+ // Canonicalize EVL loops after regions are dissolved.
+ VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
@@ -9611,7 +9636,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
- unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
+ unsigned IntVF = estimateElementCount(VF.Width, VScale);
uint64_t RtC = TotalCost.getValue();
uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 225658b..68e7c20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3391,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
- VectorType *InterleaveTy =
- VectorType::get(VecTy->getElementType(),
- VecTy->getElementCount().multiplyCoefficientBy(Factor));
- return Builder.CreateIntrinsic(InterleaveTy,
- getInterleaveIntrinsicID(Factor), Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateVectorInterleave(Vals, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3503,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
NewLoad = State.Builder.CreateIntrinsic(
- getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
- NewLoad,
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8de05c1..47a9ff0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2244,6 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// Try to optimize header mask recipes away to their EVL variants.
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+ // TODO: Split optimizeMaskToEVL out and move into
+ // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
+ // tryToBuildVPlanWithVPRecipes beforehand.
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
VPRecipeBase *EVLRecipe =
@@ -2265,6 +2268,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
}
+
+ // Replace header masks with a mask equivalent to predicating by EVL:
+ //
+ // icmp ule widen-canonical-iv backedge-taken-count
+ // ->
+ // icmp ult step-vector, EVL
+ VPRecipeBase *EVLR = EVL.getDefiningRecipe();
+ VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
+ Type *EVLType = TypeInfo.inferScalarType(&EVL);
+ VPValue *EVLMask = Builder.createICmp(
+ CmpInst::ICMP_ULT,
+ Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
+ HeaderMask->replaceAllUsesWith(EVLMask);
+ ToErase.push_back(HeaderMask->getDefiningRecipe());
}
for (VPRecipeBase *R : reverse(ToErase)) {
@@ -2373,6 +2390,66 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
return true;
}
+void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
+ using namespace llvm::VPlanPatternMatch;
+ // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
+ // There should be only one EVL PHI in the entire plan.
+ VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : VPBB->phis())
+ if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
+ assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
+ EVLPhi = PhiR;
+ }
+
+ // Early return if no EVL PHI is found.
+ if (!EVLPhi)
+ return;
+
+ VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
+ VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+
+ // Convert EVLPhi to concrete recipe.
+ auto *ScalarR =
+ VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
+ EVLPhi->getDebugLoc(), "evl.based.iv");
+ EVLPhi->replaceAllUsesWith(ScalarR);
+ EVLPhi->eraseFromParent();
+
+ // Replace CanonicalIVInc with EVL-PHI increment.
+ auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+ VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+ assert(match(Backedge,
+ m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV),
+ m_Specific(&Plan.getVFxUF()))) &&
+ "Unexpected canonical iv");
+ Backedge->replaceAllUsesWith(EVLIncrement);
+
+ // Remove unused phi and increment.
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ CanonicalIVIncrement->eraseFromParent();
+ CanonicalIV->eraseFromParent();
+
+ // Replace the use of VectorTripCount in the latch-exiting block.
+ // Before: (branch-on-count EVLIVInc, VectorTripCount)
+ // After: (branch-on-count EVLIVInc, TripCount)
+
+ VPBasicBlock *LatchExiting =
+ HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ // Skip single-iteration loop region
+ if (match(LatchExitingBr, m_BranchOnCond(m_True())))
+ return;
+ assert(LatchExitingBr &&
+ match(LatchExitingBr,
+ m_BranchOnCount(m_VPValue(EVLIncrement),
+ m_Specific(&Plan.getVectorTripCount()))) &&
+ "Unexpected terminator in EVL loop");
+ LatchExitingBr->setOperand(1, Plan.getTripCount());
+}
+
void VPlanTransforms::dropPoisonGeneratingRecipes(
VPlan &Plan,
const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2704,15 +2781,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
- auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
- {PhiR->getStartValue(), PhiR->getBackedgeValue()},
- PhiR->getDebugLoc(), "evl.based.iv");
- PhiR->replaceAllUsesWith(ScalarR);
- ToRemove.push_back(PhiR);
- continue;
- }
-
if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
ToRemove.push_back(WidenIVR);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index d5af6cd..880159f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -209,6 +209,18 @@ struct VPlanTransforms {
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
+ /// Transform EVL loops to use variable-length stepping after region
+ /// dissolution.
+ ///
+ /// Once loop regions are replaced with explicit CFG, EVL loops can step with
+ /// variable vector lengths instead of fixed lengths. This transformation:
+ /// * Makes EVL-Phi concrete.
+ // * Removes CanonicalIV and increment.
+ /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
+ /// VectorTripCount) with variable-length stepping (branch-on-cond
+ /// EVLIVInc, TripCount).
+ static void canonicalizeEVLLoops(VPlan &Plan);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 38ada33..57d01cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -17,6 +17,7 @@
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
+#include "VPlanPatternMatch.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/TypeSwitch.h"
@@ -171,7 +172,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
.Case<VPInstruction>([&](const VPInstruction *I) {
- if (I->getOpcode() == Instruction::PHI)
+ if (I->getOpcode() == Instruction::PHI ||
+ I->getOpcode() == Instruction::ICmp)
return VerifyEVLUse(*I, 1);
switch (I->getOpcode()) {
case Instruction::Add:
@@ -192,7 +194,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
errs() << "EVL used by unexpected VPInstruction\n";
return false;
}
- if (I->getNumUsers() != 1) {
+ // EVLIVIncrement is only used by EVLIV & BranchOnCount.
+ // Having more than two users is unexpected.
+ if ((I->getNumUsers() != 1) &&
+ (I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) {
+ using namespace llvm::VPlanPatternMatch;
+ return match(U, m_BranchOnCount(m_Specific(I), m_VPValue()));
+ }))) {
errs() << "EVL is used in VPInstruction with multiple users\n";
return false;
}