aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/CMakeLists.txt1
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp13
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp136
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp10
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp14
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/UniformityAnalysis.cpp1
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp36
-rw-r--r--llvm/lib/BinaryFormat/SFrame.cpp33
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp48
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp23
-rw-r--r--llvm/lib/CodeGen/CommandFlags.cpp7
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp24
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp20
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp22
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.h2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp12
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp14
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp14
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp25
-rw-r--r--llvm/lib/CodeGen/MachineInstr.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp4
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp2
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp179
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp138
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp93
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/TailDuplicator.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetInstrInfo.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp14
-rw-r--r--llvm/lib/CodeGen/WindowsSecureHotPatching.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h3
-rw-r--r--llvm/lib/Frontend/HLSL/CMakeLists.txt1
-rw-r--r--llvm/lib/Frontend/HLSL/HLSLBinding.cpp142
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp7
-rw-r--r--llvm/lib/IR/IRBuilder.cpp29
-rw-r--r--llvm/lib/IR/Intrinsics.cpp24
-rw-r--r--llvm/lib/IR/Metadata.cpp2
-rw-r--r--llvm/lib/IR/Verifier.cpp16
-rw-r--r--llvm/lib/LTO/LTO.cpp118
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp6
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp7
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp12
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp2
-rw-r--r--llvm/lib/ObjCopy/COFF/COFFReader.cpp2
-rw-r--r--llvm/lib/Object/SFrameParser.cpp72
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp1
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp8
-rw-r--r--llvm/lib/Passes/PassRegistry.def2
-rw-r--r--llvm/lib/ProfileData/MemProfReader.cpp71
-rw-r--r--llvm/lib/Support/BLAKE3/CMakeLists.txt3
-rw-r--r--llvm/lib/Support/Debug.cpp69
-rw-r--r--llvm/lib/Support/FileCollector.cpp3
-rw-r--r--llvm/lib/Support/Unix/Path.inc2
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp11
-rw-r--r--llvm/lib/Support/Windows/Threading.inc60
-rw-r--r--llvm/lib/TableGen/Record.cpp26
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp2
-rw-r--r--llvm/lib/TableGen/TGLexer.h2
-rw-r--r--llvm/lib/TableGen/TGParser.cpp53
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp135
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp57
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp13
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp47
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp10
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp90
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td20
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp89
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp36
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp65
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp30
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td3
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td134
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h21
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp145
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp44
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td7
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td198
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td93
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp5
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp7
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp74
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonMask.cpp2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp33
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp277
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp670
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td409
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h3
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp8
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrP10.td116
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp38
-rw-r--r--llvm/lib/Target/RISCV/RISCVCallingConv.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp28
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp123
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td52
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp50
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp15
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp50
-rw-r--r--llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp17
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp245
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.td9
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp45
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp28
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h2
-rw-r--r--llvm/lib/Target/X86/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp8
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp22
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp4
-rw-r--r--llvm/lib/TargetParser/Host.cpp13
-rw-r--r--llvm/lib/TargetParser/TargetParser.cpp1
-rw-r--r--llvm/lib/TargetParser/Triple.cpp6
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp102
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp2
-rw-r--r--llvm/lib/Transforms/HipStdPar/HipStdPar.cpp118
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp34
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp127
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp148
-rw-r--r--llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp31
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp154
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp8
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp8
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp16
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp51
-rw-r--r--llvm/lib/Transforms/Instrumentation/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp80
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOEstimateTripCounts.cpp45
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp20
-rw-r--r--llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp184
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp143
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp71
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h5
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp101
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp15
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp130
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp17
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp7
202 files changed, 4880 insertions, 2457 deletions
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index cfde787..16dd6f8 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -175,6 +175,7 @@ add_llvm_component_library(LLVMAnalysis
LINK_COMPONENTS
BinaryFormat
Core
+ FrontendHLSL
Object
ProfileData
Support
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 759c553..2d52f34 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1373,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand))
return flushDenormalConstantFP(CFP, Inst, IsOutput);
- if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand))
+ if (isa<ConstantAggregateZero, UndefValue>(Operand))
return Operand;
Type *Ty = Operand->getType();
@@ -1389,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst,
Ty = VecTy->getElementType();
}
+ if (isa<ConstantExpr>(Operand))
+ return Operand;
+
if (const auto *CV = dyn_cast<ConstantVector>(Operand)) {
SmallVector<Constant *, 16> NewElts;
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
@@ -2628,14 +2631,14 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::nvvm_ceil_d:
return ConstantFoldFP(
ceil, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_fabs_ftz:
case Intrinsic::nvvm_fabs:
return ConstantFoldFP(
fabs, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_floor_ftz_f:
@@ -2643,7 +2646,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::nvvm_floor_d:
return ConstantFoldFP(
floor, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
case Intrinsic::nvvm_rcp_rm_ftz_f:
@@ -2705,7 +2708,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return nullptr;
return ConstantFoldFP(
sqrt, APF, Ty,
- nvvm::GetNVVMDenromMode(
+ nvvm::GetNVVMDenormMode(
nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
// AMDGCN Intrinsics:
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 1959ab6..629fa7cd 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -995,18 +995,7 @@ SmallVector<dxil::ResourceInfo *> DXILResourceMap::findByUse(const Value *Key) {
//===----------------------------------------------------------------------===//
void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
- struct Binding {
- ResourceClass RC;
- uint32_t Space;
- uint32_t LowerBound;
- uint32_t UpperBound;
- Value *Name;
- Binding(ResourceClass RC, uint32_t Space, uint32_t LowerBound,
- uint32_t UpperBound, Value *Name)
- : RC(RC), Space(Space), LowerBound(LowerBound), UpperBound(UpperBound),
- Name(Name) {}
- };
- SmallVector<Binding> Bindings;
+ hlsl::BindingInfoBuilder Builder;
// collect all of the llvm.dx.resource.handlefrombinding calls;
// make a note if there is llvm.dx.resource.handlefromimplicitbinding
@@ -1036,133 +1025,20 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
assert((Size < 0 || (unsigned)LowerBound + Size - 1 <= UINT32_MAX) &&
"upper bound register overflow");
uint32_t UpperBound = Size < 0 ? UINT32_MAX : LowerBound + Size - 1;
- Bindings.emplace_back(RTI.getResourceClass(), Space, LowerBound,
- UpperBound, Name);
+ Builder.trackBinding(RTI.getResourceClass(), Space, LowerBound,
+ UpperBound, Name);
}
break;
}
case Intrinsic::dx_resource_handlefromimplicitbinding: {
- ImplicitBinding = true;
+ HasImplicitBinding = true;
break;
}
}
}
- // sort all the collected bindings
- llvm::stable_sort(Bindings, [](auto &LHS, auto &RHS) {
- return std::tie(LHS.RC, LHS.Space, LHS.LowerBound) <
- std::tie(RHS.RC, RHS.Space, RHS.LowerBound);
- });
-
- // remove duplicates
- Binding *NewEnd = llvm::unique(Bindings, [](auto &LHS, auto &RHS) {
- return std::tie(LHS.RC, LHS.Space, LHS.LowerBound, LHS.UpperBound,
- LHS.Name) == std::tie(RHS.RC, RHS.Space, RHS.LowerBound,
- RHS.UpperBound, RHS.Name);
- });
- if (NewEnd != Bindings.end())
- Bindings.erase(NewEnd);
-
- // Go over the sorted bindings and build up lists of free register ranges
- // for each binding type and used spaces. Bindings are sorted by resource
- // class, space, and lower bound register slot.
- BindingSpaces *BS = &SRVSpaces;
- for (const Binding &B : Bindings) {
- if (BS->RC != B.RC)
- // move to the next resource class spaces
- BS = &getBindingSpaces(B.RC);
-
- RegisterSpace *S = BS->Spaces.empty() ? &BS->Spaces.emplace_back(B.Space)
- : &BS->Spaces.back();
- assert(S->Space <= B.Space && "bindings not sorted correctly?");
- if (B.Space != S->Space)
- // add new space
- S = &BS->Spaces.emplace_back(B.Space);
-
- // The space is full - there are no free slots left, or the rest of the
- // slots are taken by an unbounded array. Set flag to report overlapping
- // binding later.
- if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < UINT32_MAX) {
- OverlappingBinding = true;
- continue;
- }
-
- // adjust the last free range lower bound, split it in two, or remove it
- BindingRange &LastFreeRange = S->FreeRanges.back();
- if (LastFreeRange.LowerBound == B.LowerBound) {
- if (B.UpperBound < UINT32_MAX)
- LastFreeRange.LowerBound = B.UpperBound + 1;
- else
- S->FreeRanges.pop_back();
- } else if (LastFreeRange.LowerBound < B.LowerBound) {
- LastFreeRange.UpperBound = B.LowerBound - 1;
- if (B.UpperBound < UINT32_MAX)
- S->FreeRanges.emplace_back(B.UpperBound + 1, UINT32_MAX);
- } else {
- OverlappingBinding = true;
- if (B.UpperBound < UINT32_MAX)
- LastFreeRange.LowerBound =
- std::max(LastFreeRange.LowerBound, B.UpperBound + 1);
- else
- S->FreeRanges.pop_back();
- }
- }
-}
-
-// returns std::nulopt if binding could not be found in given space
-std::optional<uint32_t>
-DXILResourceBindingInfo::findAvailableBinding(dxil::ResourceClass RC,
- uint32_t Space, int32_t Size) {
- BindingSpaces &BS = getBindingSpaces(RC);
- RegisterSpace &RS = BS.getOrInsertSpace(Space);
- return RS.findAvailableBinding(Size);
-}
-
-DXILResourceBindingInfo::RegisterSpace &
-DXILResourceBindingInfo::BindingSpaces::getOrInsertSpace(uint32_t Space) {
- for (auto *I = Spaces.begin(); I != Spaces.end(); ++I) {
- if (I->Space == Space)
- return *I;
- if (I->Space < Space)
- continue;
- return *Spaces.insert(I, Space);
- }
- return Spaces.emplace_back(Space);
-}
-
-std::optional<uint32_t>
-DXILResourceBindingInfo::RegisterSpace::findAvailableBinding(int32_t Size) {
- assert((Size == -1 || Size > 0) && "invalid size");
-
- if (FreeRanges.empty())
- return std::nullopt;
-
- // unbounded array
- if (Size == -1) {
- BindingRange &Last = FreeRanges.back();
- if (Last.UpperBound != UINT32_MAX)
- // this space is already occupied by an unbounded array
- return std::nullopt;
- uint32_t RegSlot = Last.LowerBound;
- FreeRanges.pop_back();
- return RegSlot;
- }
-
- // single resource or fixed-size array
- for (BindingRange &R : FreeRanges) {
- // compare the size as uint64_t to prevent overflow for range (0,
- // UINT32_MAX)
- if ((uint64_t)R.UpperBound - R.LowerBound + 1 < (uint64_t)Size)
- continue;
- uint32_t RegSlot = R.LowerBound;
- // This might create a range where (LowerBound == UpperBound + 1). When
- // that happens, the next time this function is called the range will
- // skipped over by the check above (at this point Size is always > 0).
- R.LowerBound += Size;
- return RegSlot;
- }
-
- return std::nullopt;
+ Bindings = Builder.calculateBindingInfo(
+ [this](auto, auto) { this->HasOverlappingBinding = true; });
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 518a634..d2bf4a7 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1111,9 +1111,13 @@ bool llvm::getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) {
}
std::optional<int> llvm::getOptionalIntLoopAttribute(const Loop *TheLoop,
- StringRef Name) {
- const MDOperand *AttrMD =
- findStringMetadataForLoop(TheLoop, Name).value_or(nullptr);
+ StringRef Name,
+ bool *Missing) {
+ std::optional<const MDOperand *> AttrMDOpt =
+ findStringMetadataForLoop(TheLoop, Name);
+ if (Missing)
+ *Missing = !AttrMDOpt;
+ const MDOperand *AttrMD = AttrMDOpt.value_or(nullptr);
if (!AttrMD)
return std::nullopt;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 0990a0d..477e477 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -2682,6 +2682,20 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
return getAddExpr(NewOps, PreservedFlags);
}
}
+
+ // Try to push the constant operand into a ZExt: A + zext (-A + B) -> zext
+ // (B), if trunc (A) + -A + B does not unsigned-wrap.
+ const SCEVAddExpr *InnerAdd;
+ if (match(B, m_scev_ZExt(m_scev_Add(InnerAdd)))) {
+ const SCEV *NarrowA = getTruncateExpr(A, InnerAdd->getType());
+ if (NarrowA == getNegativeSCEV(InnerAdd->getOperand(0)) &&
+ getZeroExtendExpr(NarrowA, B->getType()) == A &&
+ hasFlags(StrengthenNoWrapFlags(this, scAddExpr, {NarrowA, InnerAdd},
+ SCEV::FlagAnyWrap),
+ SCEV::FlagNUW)) {
+ return getZeroExtendExpr(getAddExpr(NarrowA, InnerAdd), B->getType());
+ }
+ }
}
// Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 55ba52a..c7eb2ec 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1486,6 +1486,10 @@ void TargetTransformInfo::collectKernelLaunchBounds(
return TTIImpl->collectKernelLaunchBounds(F, LB);
}
+bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
+ return TTIImpl->allowVectorElementIndexingUsingGEP();
+}
+
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index c871070..7025b83 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const {
Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
@@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const {
Result.TBAA = Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 15107c2..2e4063f 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -178,6 +178,7 @@ bool UniformityInfoWrapperPass::runOnFunction(Function &F) {
void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const {
OS << "UniformityInfo for function '" << m_function->getName() << "':\n";
+ m_uniformityInfo.print(OS);
}
void UniformityInfoWrapperPass::releaseMemory() {
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1b3da59..425ea31 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::exp:
case Intrinsic::exp10:
case Intrinsic::exp2:
+ case Intrinsic::ldexp:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::canonicalize:
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
switch (ID) {
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
case Intrinsic::powi:
+ case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
default:
return OpdIdx == -1;
@@ -240,30 +246,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
-struct InterleaveIntrinsic {
- Intrinsic::ID Interleave, Deinterleave;
-};
-
-static InterleaveIntrinsic InterleaveIntrinsics[] = {
- {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
- {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
- {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
- {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
- {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
- {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
- {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
-};
-
-Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Interleave;
-}
-
-Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Deinterleave;
-}
-
unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::vector_interleave2:
@@ -1141,7 +1123,7 @@ Constant *
llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
const InterleaveGroup<Instruction> &Group) {
// All 1's means mask is not needed.
- if (Group.getNumMembers() == Group.getFactor())
+ if (Group.isFull())
return nullptr;
// TODO: support reversed access.
@@ -1687,7 +1669,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
// load would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ if (Group->isFull())
continue;
// Case 2: If first and last members of the group don't wrap this implies
@@ -1722,7 +1704,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
// store would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ if (Group->isFull())
continue;
// Interleave-store-group with gaps is implemented using masked wide store.
diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp
index 3b436af..f1765d7 100644
--- a/llvm/lib/BinaryFormat/SFrame.cpp
+++ b/llvm/lib/BinaryFormat/SFrame.cpp
@@ -35,3 +35,36 @@ ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() {
};
return ArrayRef(ABIs);
}
+
+ArrayRef<EnumEntry<sframe::FREType>> sframe::getFRETypes() {
+ static constexpr EnumEntry<sframe::FREType> FRETypes[] = {
+#define HANDLE_SFRAME_FRE_TYPE(CODE, NAME) {#NAME, sframe::FREType::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FRETypes);
+}
+
+ArrayRef<EnumEntry<sframe::FDEType>> sframe::getFDETypes() {
+ static constexpr EnumEntry<sframe::FDEType> FDETypes[] = {
+#define HANDLE_SFRAME_FDE_TYPE(CODE, NAME) {#NAME, sframe::FDEType::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FDETypes);
+}
+
+ArrayRef<EnumEntry<sframe::AArch64PAuthKey>> sframe::getAArch64PAuthKeys() {
+ static constexpr EnumEntry<sframe::AArch64PAuthKey> AArch64PAuthKeys[] = {
+#define HANDLE_SFRAME_AARCH64_PAUTH_KEY(CODE, NAME) \
+ {#NAME, sframe::AArch64PAuthKey::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(AArch64PAuthKeys);
+}
+
+ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() {
+ static constexpr EnumEntry<sframe::FREOffset> FREOffsets[] = {
+#define HANDLE_SFRAME_FRE_OFFSET(CODE, NAME) {#NAME, sframe::FREOffset::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(FREOffsets);
+}
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 3b3e7a4..dcfd9aa 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2083,22 +2083,55 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (TBB == FBB) {
MBB->splice(Loc, TBB, TBB->begin(), TIB);
} else {
+ // Merge the debug locations, and hoist and kill the debug instructions from
+ // both branches. FIXME: We could probably try harder to preserve some debug
+ // instructions (but at least this isn't producing wrong locations).
+ MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc);
+ auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) {
+ assert(DI->isDebugInstr() && "Expected a debug instruction");
+ if (DI->isDebugRef()) {
+ const TargetInstrInfo *TII =
+ MBB->getParent()->getSubtarget().getInstrInfo();
+ const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE);
+ DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0,
+ DI->getDebugVariable(), DI->getDebugExpression());
+ MBB->insert(Loc, &*DI);
+ return;
+ }
+ // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF.
+ if (DI->isDebugPHI()) {
+ DI->eraseFromParent();
+ return;
+ }
+ // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef.
+ if (!DI->isDebugLabel())
+ DI->setDebugValueUndef();
+ DI->moveBefore(&*Loc);
+ };
+
// TIB and FIB point to the end of the regions to hoist/merge in TBB and
// FBB.
MachineBasicBlock::iterator FE = FIB;
MachineBasicBlock::iterator FI = FBB->begin();
for (MachineBasicBlock::iterator TI :
make_early_inc_range(make_range(TBB->begin(), TIB))) {
- // Move debug instructions and pseudo probes without modifying them.
- // FIXME: This is the wrong thing to do for debug locations, which
- // should at least be killed (and hoisted from BOTH blocks).
- if (TI->isDebugOrPseudoInstr()) {
- TI->moveBefore(&*Loc);
+ // Hoist and kill debug instructions from FBB. After this loop FI points
+ // to the next non-debug instruction to hoist (checked in assert after the
+ // TBB debug instruction handling code).
+ while (FI != FE && FI->isDebugInstr())
+ HoistAndKillDbgInstr(FI++);
+
+ // Kill debug instructions before moving.
+ if (TI->isDebugInstr()) {
+ HoistAndKillDbgInstr(TI);
continue;
}
- // Get the next non-meta instruction in FBB.
- FI = skipDebugInstructionsForward(FI, FE, false);
+ // FI and TI now point to identical non-debug instructions.
+ assert(FI != FE && "Unexpected end of FBB range");
+ // Pseudo probes are excluded from the range when identifying foldable
+ // instructions, so we don't expect to see one now.
+ assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range");
// NOTE: The loop above checks CheckKillDead but we can't do that here as
// it modifies some kill markers after the check.
assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) &&
@@ -2111,6 +2144,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
++FI;
}
}
+
FBB->erase(FBB->begin(), FIB);
if (UpdateLiveIns)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 416c56d..f16283b 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2769,6 +2769,29 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
return optimizeGatherScatterInst(II, II->getArgOperand(0));
case Intrinsic::masked_scatter:
return optimizeGatherScatterInst(II, II->getArgOperand(1));
+ case Intrinsic::masked_load:
+ // Treat v1X masked load as load X type.
+ if (auto *VT = dyn_cast<FixedVectorType>(II->getType())) {
+ if (VT->getNumElements() == 1) {
+ Value *PtrVal = II->getArgOperand(0);
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, VT->getElementType(), AS))
+ return true;
+ }
+ }
+ return false;
+ case Intrinsic::masked_store:
+ // Treat v1X masked store as store X type.
+ if (auto *VT =
+ dyn_cast<FixedVectorType>(II->getArgOperand(0)->getType())) {
+ if (VT->getNumElements() == 1) {
+ Value *PtrVal = II->getArgOperand(1);
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, VT->getElementType(), AS))
+ return true;
+ }
+ }
+ return false;
}
SmallVector<Value *, 2> PtrOps;
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 9512f79..810dc29 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion)
CGOPT(DebuggerKind, DebuggerTuningOpt)
CGOPT(bool, EnableStackSizeSection)
CGOPT(bool, EnableAddrsig)
+CGOPT(bool, EnableCallGraphSection)
CGOPT(bool, EmitCallSiteInfo)
CGOPT(bool, EnableMachineFunctionSplitter)
CGOPT(bool, EnableStaticDataPartitioning)
@@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableAddrsig);
+ static cl::opt<bool> EnableCallGraphSection(
+ "call-graph-section", cl::desc("Emit a call graph section"),
+ cl::init(false));
+ CGBINDOPT(EnableCallGraphSection);
+
static cl::opt<bool> EmitCallSiteInfo(
"emit-call-site-info",
cl::desc(
@@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter();
Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning();
Options.EmitAddrsig = getEnableAddrsig();
+ Options.EmitCallGraphSection = getEnableCallGraphSection();
Options.EmitCallSiteInfo = getEmitCallSiteInfo();
Options.EnableDebugEntryValues = getEnableDebugEntryValues();
Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8855740f..9b2851e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *NewTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(Node->Real->getType()));
auto *R = dyn_cast<Instruction>(Node->Real);
auto *I = dyn_cast<Instruction>(Node->Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
} else {
- ReplacementNode = Builder.CreateIntrinsic(
- Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
+ ReplacementNode =
+ Builder.CreateVectorInterleave({Node->Real, Node->Imag});
}
break;
}
@@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
- auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(MaskReal->getType()));
- auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
}
if (!NewInit)
- NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
+ NewInit =
+ Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation(
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
-
// We have to interleave initial origin values coming from IncomingBlock
Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming);
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {InitReal, InitImag});
+ auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 012d873..9ba1782 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
@@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
for (unsigned I = 0; I < NumValues; ++I) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy,
+ Offsets[I]);
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
MRI.getType(VRegs[I]),
commonAlignment(BaseAlign, Offsets[I]));
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e8f513a..e84ba91 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
const TargetOptions &Options = MF->getTarget().Options;
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (CanReassociate &&
- !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
+ if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc))
return false;
// Floating-point multiply-add with intermediate rounding.
@@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
if (!HasFMAD && !HasFMA)
return false;
- AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD;
+ AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dc5dfab..fd38c30 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
for (unsigned i = 0; i < Regs.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(LI);
@@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
for (unsigned i = 0; i < Vals.size(); ++i) {
Register Addr;
- MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+ MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(SI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f..d9d3569 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
- auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst);
auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
SmallPtr, *SmallMMO);
@@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
LLT PtrTy = MRI.getType(PtrReg);
auto OffsetCst = MIRBuilder.buildConstant(
LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
- auto SmallPtr =
- MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
+ auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst);
MachineMemOperand *LargeMMO =
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
unsigned ByteOffset = Offset / 8;
Register NewAddrReg;
- MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
+ MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy,
+ ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
@@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
- if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
+ if (MI.getFlag(MachineInstr::FmAfn)) {
unsigned Flags = MI.getFlags();
auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
@@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
if (DstOff != 0) {
auto Offset =
MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
- Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0);
}
MIB.buildStore(Value, Ptr, *StoreMMO);
@@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
.getReg(0);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
@@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
Register StorePtr = Dst;
if (CurrOffset != 0) {
LLT DstTy = MRI.getType(Dst);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LdVal, StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
@@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT SrcTy = MRI.getType(Src);
auto Offset =
MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
- LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0);
}
LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
CurrOffset += CopyTy.getSizeInBytes();
@@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
LLT DstTy = MRI.getType(Dst);
auto Offset =
MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
- StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
CurrOffset += CopyTy.getSizeInBytes();
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 121d7e8..27df7e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -208,11 +208,20 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0,
return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags);
}
+MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1) {
+ return buildPtrAdd(Res, Op0, Op1,
+ MachineInstr::MIFlag::NoUWrap |
+ MachineInstr::MIFlag::InBounds);
+}
+
std::optional<MachineInstrBuilder>
MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
- const LLT ValueTy, uint64_t Value) {
+ const LLT ValueTy, uint64_t Value,
+ std::optional<unsigned> Flags) {
assert(Res == 0 && "Res is a result argument");
- assert(ValueTy.isScalar() && "invalid offset type");
+ assert(ValueTy.isScalar() && "invalid offset type");
if (Value == 0) {
Res = Op0;
@@ -221,7 +230,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0));
auto Cst = buildConstant(ValueTy, Value);
- return buildPtrAdd(Res, Op0, Cst.getReg(0));
+ return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags);
+}
+
+std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset(
+ Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) {
+ return materializePtrAdd(Res, Op0, ValueTy, Value,
+ MachineInstr::MIFlag::NoUWrap |
+ MachineInstr::MIFlag::InBounds);
}
MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 7153902..8b72c29 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -217,6 +217,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("nneg", MIToken::kw_nneg)
.Case("disjoint", MIToken::kw_disjoint)
.Case("samesign", MIToken::kw_samesign)
+ .Case("inbounds", MIToken::kw_inbounds)
.Case("nofpexcept", MIToken::kw_nofpexcept)
.Case("unpredictable", MIToken::kw_unpredictable)
.Case("debug-location", MIToken::kw_debug_location)
@@ -616,6 +617,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
.Case("!range", MIToken::md_range)
.Case("!DIExpression", MIToken::md_diexpr)
.Case("!DILocation", MIToken::md_dilocation)
+ .Case("!noalias.addrspace", MIToken::md_noalias_addrspace)
.Default(MIToken::Error);
}
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index d7cd067..0627f17 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -78,6 +78,7 @@ struct MIToken {
kw_nneg,
kw_disjoint,
kw_samesign,
+ kw_inbounds,
kw_debug_location,
kw_debug_instr_number,
kw_dbg_instr_ref,
@@ -151,6 +152,7 @@ struct MIToken {
md_tbaa,
md_alias_scope,
md_noalias,
+ md_noalias_addrspace,
md_range,
md_diexpr,
md_dilocation,
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 3a364d5..6a464d9 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1477,7 +1477,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
Token.is(MIToken::kw_nneg) ||
Token.is(MIToken::kw_disjoint) ||
Token.is(MIToken::kw_nusw) ||
- Token.is(MIToken::kw_samesign)) {
+ Token.is(MIToken::kw_samesign) ||
+ Token.is(MIToken::kw_inbounds)) {
// clang-format on
// Mine frame and fast math flags
if (Token.is(MIToken::kw_frame_setup))
@@ -1518,6 +1519,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
Flags |= MachineInstr::NoUSWrap;
if (Token.is(MIToken::kw_samesign))
Flags |= MachineInstr::SameSign;
+ if (Token.is(MIToken::kw_inbounds))
+ Flags |= MachineInstr::InBounds;
lex();
}
@@ -3482,6 +3485,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseMDNode(AAInfo.NoAlias))
return true;
break;
+ case MIToken::md_noalias_addrspace:
+ lex();
+ if (parseMDNode(AAInfo.NoAliasAddrSpace))
+ return true;
+ break;
case MIToken::md_range:
lex();
if (parseMDNode(Range))
@@ -3490,7 +3498,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
// TODO: Report an error on duplicate metadata nodes.
default:
return error("expected 'align' or '!tbaa' or '!alias.scope' or "
- "'!noalias' or '!range'");
+ "'!noalias' or '!range' or '!noalias.addrspace'");
}
}
if (expectAndConsume(MIToken::rparen))
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 1e9fcf3..3e99e57 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo(
return error(Error, ArgRegPair.Reg.SourceRange);
CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
}
+ if (!YamlCSInfo.CalleeTypeIds.empty()) {
+ for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds) {
+ IntegerType *Int64Ty = Type::getInt64Ty(Context);
+ CSInfo.CalleeTypeIds.push_back(ConstantInt::get(Int64Ty, CalleeTypeId,
+ /*isSigned=*/false));
+ }
+ }
- if (TM.Options.EmitCallSiteInfo)
+ if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)
MF.addCallSiteInfo(&*CallI, std::move(CSInfo));
}
- if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
- return error(Twine("Call site info provided but not used"));
+ if (!YamlMF.CallSitesInfo.empty() &&
+ !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection))
+ return error("call site info provided but not used");
return false;
}
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bc4e299..ce1834a 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF,
const MachineFunction &MF,
ModuleSlotTracker &MST) {
const auto *TRI = MF.getSubtarget().getRegisterInfo();
- for (auto CSInfo : MF.getCallSitesInfo()) {
+ for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) {
yaml::CallSiteInfo YmlCS;
yaml::MachineInstrLoc CallLocation;
// Prepare instruction position.
- MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator();
+ MachineBasicBlock::const_instr_iterator CallI = MI->getIterator();
CallLocation.BlockNum = CallI->getParent()->getNumber();
// Get call instruction offset from the beginning of block.
CallLocation.Offset =
std::distance(CallI->getParent()->instr_begin(), CallI);
YmlCS.CallLocation = CallLocation;
+
+ auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo;
// Construct call arguments and theirs forwarding register info.
- for (auto ArgReg : CSInfo.second.ArgRegPairs) {
+ for (auto ArgReg : ArgRegPairs) {
yaml::CallSiteInfo::ArgRegPair YmlArgReg;
YmlArgReg.ArgNo = ArgReg.ArgNo;
printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI);
YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg);
}
+ // Get type ids.
+ for (auto *CalleeTypeId : CalleeTypeIds) {
+ YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue());
+ }
YMF.CallSitesInfo.push_back(std::move(YmlCS));
}
@@ -814,6 +820,8 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
OS << "nusw ";
if (MI.getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ if (MI.getFlag(MachineInstr::InBounds))
+ OS << "inbounds ";
// NOTE: Please add new MIFlags also to the MI_FLAGS_STR in
// llvm/utils/update_mir_test_checks.py.
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 429a17a..ec40f6a 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -211,8 +211,7 @@ void MachineFunction::init() {
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
- // FIXME: Use Function::hasOptSize().
- if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.getAlign() && !F.hasOptSize())
Alignment = std::max(Alignment,
STI->getTargetLowering()->getPrefFunctionAlignment());
@@ -699,6 +698,26 @@ bool MachineFunction::needsFrameMoves() const {
!F.getParent()->debug_compile_units().empty();
}
+MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
+ // Numeric callee_type ids are only for indirect calls.
+ if (!CB.isIndirectCall())
+ return;
+
+ MDNode *CalleeTypeList = CB.getMetadata(LLVMContext::MD_callee_type);
+ if (!CalleeTypeList)
+ return;
+
+ for (const MDOperand &Op : CalleeTypeList->operands()) {
+ MDNode *TypeMD = cast<MDNode>(Op);
+ MDString *TypeIdStr = cast<MDString>(TypeMD->getOperand(1));
+ // Compute numeric type id from generalized type id string
+ uint64_t TypeIdVal = MD5Hash(TypeIdStr->getString());
+ IntegerType *Int64Ty = Type::getInt64Ty(CB.getContext());
+ CalleeTypeIds.push_back(
+ ConstantInt::get(Int64Ty, TypeIdVal, /*IsSigned=*/false));
+ }
+}
+
namespace llvm {
template<>
@@ -920,7 +939,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) {
assert(MI->isCandidateForAdditionalCallInfo() &&
"Call site info refers only to call (MI) candidates");
- if (!Target.Options.EmitCallSiteInfo)
+ if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection)
return CallSitesInfo.end();
return CallSitesInfo.find(MI);
}
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index da3665b..79047f7 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -585,6 +585,8 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) {
MIFlags |= MachineInstr::MIFlag::NoUSWrap;
if (GEP->hasNoUnsignedWrap())
MIFlags |= MachineInstr::MIFlag::NoUWrap;
+ if (GEP->isInBounds())
+ MIFlags |= MachineInstr::MIFlag::InBounds;
}
// Copy the nonneg flag.
@@ -1860,8 +1862,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "nneg ";
if (getFlag(MachineInstr::Disjoint))
OS << "disjoint ";
+ if (getFlag(MachineInstr::NoUSWrap))
+ OS << "nusw ";
if (getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ if (getFlag(MachineInstr::InBounds))
+ OS << "inbounds ";
// Print the opcode name.
if (TII)
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d25169..c612f8de 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << ", !noalias ";
AAInfo.NoAlias->printAsOperand(OS, MST);
}
+ if (AAInfo.NoAliasAddrSpace) {
+ OS << ", !noalias.addrspace ";
+ AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST);
+ }
if (getRanges()) {
OS << ", !range ";
getRanges()->printAsOperand(OS, MST);
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f742c4..21bf052 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -423,7 +423,7 @@ void ModuloScheduleExpander::generateExistingPhis(
// potentially define two values.
unsigned MaxPhis = PrologStage + 2;
if (!InKernel && (int)PrologStage <= LoopValStage)
- MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
+ MaxPhis = std::max((int)MaxPhis - LoopValStage, 1);
unsigned NumPhis = std::min(NumStages, MaxPhis);
Register NewReg;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 2d7987a..7ede564 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -306,7 +306,12 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
/// number if it is not zero. If DstReg is a physical register and the
/// existing subregister number of the def / use being updated is not zero,
/// make sure to set it to the correct physical subregister.
- void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
+ ///
+ /// If \p SubregToRegSrcInst is not empty, we are coalescing a
+ /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an
+ /// implicit-def of DstReg on instructions that define SrcReg.
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
+ ArrayRef<MachineInstr *> SubregToRegSrcInst = {});
/// If the given machine operand reads only undefined lanes add an undef
/// flag.
@@ -1443,6 +1448,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// CopyMI may have implicit operands, save them so that we can transfer them
// over to the newly materialized instruction after CopyMI is removed.
+ LaneBitmask NewMIImplicitOpsMask;
SmallVector<MachineOperand, 4> ImplicitOps;
ImplicitOps.reserve(CopyMI->getNumOperands() -
CopyMI->getDesc().getNumOperands());
@@ -1457,6 +1463,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
(MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) &&
"unexpected implicit virtual register def");
ImplicitOps.push_back(MO);
+ if (MO.isDef() && MO.getReg().isVirtual() &&
+ MRI->shouldTrackSubRegLiveness(DstReg))
+ NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
}
}
@@ -1499,14 +1508,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
} else {
assert(MO.getReg() == NewMI.getOperand(0).getReg());
- // We're only expecting another def of the main output, so the range
- // should get updated with the regular output range.
- //
- // FIXME: The range updating below probably needs updating to look at
- // the super register if subranges are tracked.
- assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
- "subrange update for implicit-def of super register may not be "
- "properly handled");
+ // If lanemasks need to be tracked, compile the lanemask of the NewMI
+ // implicit def operands to avoid subranges for the super-regs from
+ // being removed by code later on in this function.
+ if (MRI->shouldTrackSubRegLiveness(MO.getReg()))
+ NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
}
}
}
@@ -1606,7 +1612,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
- if ((SR.LaneMask & DstMask).none()) {
+ if ((SR.LaneMask & DstMask).none() &&
+ (SR.LaneMask & NewMIImplicitOpsMask).none()) {
LLVM_DEBUG(dbgs()
<< "Removing undefined SubRange "
<< PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
@@ -1870,11 +1877,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
}
}
-void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
- unsigned SubIdx) {
+void RegisterCoalescer::updateRegDefsUses(
+ Register SrcReg, Register DstReg, unsigned SubIdx,
+ ArrayRef<MachineInstr *> SubregToRegSrcInsts) {
bool DstIsPhys = DstReg.isPhysical();
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
+ // Coalescing a COPY may expose reads of 'undef' subregisters.
+ // If so, then explicitly propagate 'undef' to those operands.
if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
if (MO.isUndef())
@@ -1891,6 +1901,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
}
}
+ // If DstInt already has a subrange for the unused lanes, then we shouldn't
+ // create duplicate subranges when we update the interval for unused lanes.
+ LaneBitmask DstIntLaneMask;
+ if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ for (LiveInterval::SubRange &SR : DstInt->subranges())
+ DstIntLaneMask |= SR.LaneMask;
+ }
+
+ // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'.
SmallPtrSet<MachineInstr *, 8> Visited;
for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg),
E = MRI->reg_instr_end();
@@ -1914,6 +1933,80 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+ bool RequiresImplicitRedef = false;
+ if (!SubregToRegSrcInsts.empty()) {
+ // We can only add an implicit-def and undef if the sub registers match,
+ // e.g.
+ // %0:gr32 = INSTX
+ // %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined
+ // %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32
+ //
+ // This cannot be transformed into:
+ // %1.sub32:gr64 = INSTX
+ // undef %1.sub8:gr64 = INSTY , implicit-def %1
+ //
+ // Because that would thrash the top 24 bits of %1.sub32.
+ if (is_contained(SubregToRegSrcInsts, UseMI) &&
+ all_of(UseMI->defs(),
+ [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool {
+ if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef())
+ return true;
+ return SubIdx == MO.getSubReg();
+ })) {
+ // Add implicit-def of super-register to express that the whole
+ // register is defined by the instruction.
+ MachineInstrBuilder MIB(*MF, UseMI);
+ MIB.addReg(DstReg, RegState::ImplicitDefine);
+ RequiresImplicitRedef = true;
+ }
+
+ // If the coalesed instruction doesn't fully define the register, we need
+ // to preserve the original super register liveness for SUBREG_TO_REG.
+ //
+ // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
+ // but it introduces liveness for other subregisters. Downstream users may
+ // have been relying on those bits, so we need to ensure their liveness is
+ // captured with a def of other lanes.
+ if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ // First check if there is sufficient granularity in terms of subranges.
+ LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg());
+ LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+ LaneBitmask UnusedLanes = DstMask & ~UsedLanes;
+ if ((UnusedLanes & ~DstIntLaneMask).any()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt);
+ DstIntLaneMask |= UnusedLanes;
+ }
+
+ // After duplicating the live ranges for the low/hi bits, we
+ // need to update the subranges of the DstReg interval such that
+ // for a case like this:
+ //
+ // entry:
+ // 16B %1:gpr32 = INSTRUCTION (<=> UseMI)
+ // :
+ // if.then:
+ // 32B %1:gpr32 = MOVIMM32 ..
+ // 48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32
+ //
+ // Only the MOVIMM32 require a def of the top lanes and any intervals
+ // for the top 32-bits of the def at 16B should be removed.
+ for (LiveInterval::SubRange &SR : DstInt->subranges()) {
+ if (!Writes || RequiresImplicitRedef ||
+ (SR.LaneMask & UnusedLanes).none())
+ continue;
+
+ assert((SR.LaneMask & UnusedLanes) == SR.LaneMask &&
+ "Unexpected lanemask. Subrange needs finer granularity");
+
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false);
+ auto SegmentI = SR.find(UseIdx);
+ if (SegmentI != SR.end())
+ SR.removeSegment(SegmentI, true);
+ }
+ }
+ }
+
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned Op : Ops) {
MachineOperand &MO = UseMI->getOperand(Op);
@@ -1922,7 +2015,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
// turn a full def into a read-modify-write sub-register def and vice
// versa.
if (SubIdx && MO.isDef())
- MO.setIsUndef(!Reads);
+ MO.setIsUndef(!Reads || RequiresImplicitRedef);
// A subreg use of a partially undef (super) register may be a complete
// undef use now and then has to be marked that way.
@@ -2025,6 +2118,30 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
LIS->shrinkToUses(&LI);
}
+/// For a given use of value \p Idx, it returns the def in the current block,
+/// or otherwise all possible defs in preceding blocks.
+static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks,
+ SmallVector<MachineInstr *> &Instrs,
+ LiveIntervals *LIS, LiveInterval &SrcInt,
+ MachineBasicBlock *MBB, VNInfo *Idx) {
+ if (!Idx->isPHIDef()) {
+ MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def);
+ assert(Def && "Unable to find a def for SUBREG_TO_REG source operand");
+ Instrs.push_back(Def);
+ return true;
+ }
+
+ bool Any = false;
+ if (VisitedBlocks.count(MBB))
+ return false;
+ VisitedBlocks.insert(MBB);
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred,
+ SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred)));
+ }
+ return Any;
+}
+
bool RegisterCoalescer::joinCopy(
MachineInstr *CopyMI, bool &Again,
SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
@@ -2156,6 +2273,35 @@ bool RegisterCoalescer::joinCopy(
});
}
+ SmallVector<MachineInstr *> SubregToRegSrcInsts;
+ if (CopyMI->isSubregToReg()) {
+ // For the case where the copy instruction is a SUBREG_TO_REG, e.g.
+ //
+ // %0:gpr32 = movimm32 ..
+ // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32
+ // ...
+ // %0:gpr32 = COPY <something>
+ //
+ // After joining liveranges, the original `movimm32` will need an
+ // implicit-def to make it explicit that the entire register is written,
+ // i.e.
+ //
+ // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0
+ // ...
+ // undef %0.sub32:gpr64 = COPY <something> // Note that this does not
+ // // require an implicit-def,
+ // // because it has nothing to
+ // // do with the SUBREG_TO_REG.
+ LiveInterval &SrcInt =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI);
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks;
+ if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt,
+ CopyMI->getParent(),
+ SrcInt.Query(SubregToRegSlotIdx).valueIn()))
+ llvm_unreachable("SUBREG_TO_REG src requires a def");
+ }
+
ShrinkMask = LaneBitmask::getNone();
ShrinkMainRange = false;
@@ -2225,9 +2371,12 @@ bool RegisterCoalescer::joinCopy(
// Rewrite all SrcReg operands to DstReg.
// Also update DstReg operands to include DstIdx if it is set.
- if (CP.getDstIdx())
+ if (CP.getDstIdx()) {
+ assert(SubregToRegSrcInsts.empty() && "can this happen?");
updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
- updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+ }
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
+ SubregToRegSrcInsts);
// Shrink subregister ranges if necessary.
if (ShrinkMask.any()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3df434..a43020e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
}
}
- // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
- // than X, and the And doesn't change the lower iX bits, we can move the
- // AssertZext in front of the And and drop the AssertSext.
if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext &&
isa<ConstantSDNode>(N0.getOperand(1))) {
- SDValue BigA = N0.getOperand(0);
- EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
const APInt &Mask = N0.getConstantOperandAPInt(1);
- if (AssertVT.bitsLT(BigA_AssertVT) &&
- Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
- SDLoc DL(N);
- SDValue NewAssert =
- DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
- return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
- N0.getOperand(1));
+
+ // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
+ // than X, and the And doesn't change the lower iX bits, we can move the
+ // AssertZext in front of the And and drop the AssertSext.
+ if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) {
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ if (AssertVT.bitsLT(BigA_AssertVT) &&
+ Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ SDValue NewAssert =
+ DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
+ N0.getOperand(1));
+ }
}
+
+ // Remove AssertZext entirely if the mask guarantees the assertion cannot
+ // fail.
+ // TODO: Use KB countMinLeadingZeros to handle non-constant masks?
+ if (Mask.isIntN(AssertVT.getScalarSizeInBits()))
+ return N0;
}
return SDValue();
@@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
- StoreSize.getFixedValue() * 8)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (LifetimeEndBase.contains(
+ DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8,
+ StoreBase, StoreSize.getFixedValue() * 8)) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
@@ -28971,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // Match a pattern such as:
+ // (X | (X >> C0) | (X >> C1) | ...) & Mask
+ // This extracts contiguous parts of X and ORs them together before comparing.
+ // We can optimize this so that we directly check (X & SomeMask) instead,
+ // eliminating the shifts.
+
+ EVT VT = Root.getValueType();
+
+ // TODO: Support vectors?
+ if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N0 = Root.getOperand(0);
+ SDValue N1 = Root.getOperand(1);
+
+ if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
+ return SDValue();
+
+ APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
+
+ SDValue Src;
+ const auto IsSrc = [&](SDValue V) {
+ if (!Src) {
+ Src = V;
+ return true;
+ }
+
+ return Src == V;
+ };
+
+ SmallVector<SDValue> Worklist = {N0};
+ APInt PartsMask(VT.getSizeInBits(), 0);
+ while (!Worklist.empty()) {
+ SDValue V = Worklist.pop_back_val();
+ if (!V.hasOneUse() && (Src && Src != V))
+ return SDValue();
+
+ if (V.getOpcode() == ISD::OR) {
+ Worklist.push_back(V.getOperand(0));
+ Worklist.push_back(V.getOperand(1));
+ continue;
+ }
+
+ if (V.getOpcode() == ISD::SRL) {
+ SDValue ShiftSrc = V.getOperand(0);
+ SDValue ShiftAmt = V.getOperand(1);
+
+ if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
+ return SDValue();
+
+ auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal();
+ if (ShiftAmtVal > RootMask.getBitWidth())
+ return SDValue();
+
+ PartsMask |= (RootMask << ShiftAmtVal);
+ continue;
+ }
+
+ if (IsSrc(V)) {
+ PartsMask |= RootMask;
+ continue;
+ }
+
+ return SDValue();
+ }
+
+ if (!Src)
+ return SDValue();
+
+ SDLoc DL(Root);
+ return DAG.getNode(ISD::AND, DL, VT,
+ {Src, DAG.getConstant(PartsMask, DL, VT)});
+}
+
/// This is a stub for TargetLowering::SimplifySetCC.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, const SDLoc &DL,
bool foldBooleans) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
- return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+ if (SDValue C =
+ TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
+ return C;
+
+ if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND &&
+ isNullConstant(N1)) {
+
+ if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
+ return DAG.getSetCC(DL, VT, Res, N1, Cond);
+ }
+
+ return SDValue();
}
/// Given an ISD::SDIV node expressing a divide by constant, return
@@ -29415,7 +29513,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
MachineMemOperand *MMO;
};
- auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
+ auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
@@ -29428,13 +29526,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
LSN->getBasePtr(), Offset /*base offset*/,
LocationSize::precise(Size), LSN->getMemOperand()};
}
- if (const auto *LN = cast<LifetimeSDNode>(N))
+ if (const auto *LN = cast<LifetimeSDNode>(N)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
0,
- LocationSize::precise(LN->getSize()),
+ LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())),
(MachineMemOperand *)nullptr};
+ }
// Default.
return {false /*isvolatile*/,
/*isAtomic*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74172b2..ba0ab23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
case ISD::FP_TO_FP16:
LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
- if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) {
SDValue Op = Node->getOperand(0);
MVT SVT = Op.getSimpleValueType();
if ((SVT == MVT::f64 || SVT == MVT::f80) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 6a2e782..31e7855 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -888,7 +888,8 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
}
if (MI->isCandidateForAdditionalCallInfo()) {
- if (DAG->getTarget().Options.EmitCallSiteInfo)
+ if (DAG->getTarget().Options.EmitCallSiteInfo ||
+ DAG->getTarget().Options.EmitCallGraphSection)
MF.addCallSiteInfo(MI, DAG->getCallSiteInfo(Node));
if (auto CalledGlobal = DAG->getCalledGlobal(Node))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 773ff48..02d1100 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
- case ISD::LIFETIME_START:
- case ISD::LIFETIME_END:
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex());
@@ -7847,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
}
- // Perform trivial constant folding.
- if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
- return SV;
+ if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) {
+ switch (Opcode) {
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::PTRADD:
+ case ISD::SUB:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::UMIN:
+ case ISD::OR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::SMIN:
+ // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison.
+ return N2.getOpcode() == ISD::POISON ? N2 : N1;
+ }
+ }
// Canonicalize an UNDEF to the RHS, even over a constant.
- if (N1.isUndef()) {
+ if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) {
if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
case ISD::PTRADD:
case ISD::SUB:
- // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(undef, non_undef_arg2) -> undef.
+ return N1;
case ISD::SIGN_EXTEND_INREG:
case ISD::UDIV:
case ISD::SDIV:
@@ -7868,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::SREM:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ // fold op(undef, non_undef_arg2) -> 0.
+ return getConstant(0, DL, VT);
}
}
}
// Fold a bunch of operators when the RHS is undef.
- if (N2.isUndef()) {
+ if (N2.getOpcode() == ISD::UNDEF) {
switch (Opcode) {
case ISD::XOR:
- if (N1.isUndef())
+ if (N1.getOpcode() == ISD::UNDEF)
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
@@ -7887,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::ADD:
case ISD::PTRADD:
case ISD::SUB:
+ // fold op(arg1, undef) -> undef.
+ return N2;
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(arg1, undef) -> poison.
+ return getPOISON(VT);
case ISD::MUL:
case ISD::AND:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ case ISD::UMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT);
case ISD::OR:
case ISD::SADDSAT:
case ISD::UADDSAT:
- // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) ->
- // poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getAllOnesConstant(DL, VT);
+ case ISD::UMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT);
+ case ISD::SMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL,
+ VT);
+ case ISD::SMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL,
+ VT);
}
}
+ // Perform trivial constant folding.
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
+ return SV;
+
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
@@ -9360,8 +9397,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
}
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
- SDValue Chain, int FrameIndex,
- int64_t Size) {
+ SDValue Chain, int FrameIndex) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9373,13 +9409,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
- ID.AddInteger(Size);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTs, Size);
+ LifetimeSDNode *N =
+ newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1636465..306e068 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) {
// FPTrunc is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
+ SDNodeFlags Flags;
+ if (auto *TruncInst = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*TruncInst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
DAG.getTargetConstant(
- 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout())),
+ Flags));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
@@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const int64_t ObjectSize =
- cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
// First check that the Alloca is static, otherwise it won't have a
@@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
const int FrameIndex = SI->second;
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex);
DAG.setRoot(Res);
return;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9474587..900da76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< " -> "
<< ASC->getDestAddressSpace()
<< ']';
- } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910..48d6b99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9471,7 +9471,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
ISD::SRL, DL, VT,
DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg),
DAG.getConstant(DeBruijn, DL, VT)),
- DAG.getConstant(ShiftAmt, DL, VT));
+ DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD));
SmallVector<uint8_t> Table(BitWidth, 0);
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57f..d319a97 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,7 +604,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
bool HasComputedGoto = false;
if (!TailBB.empty()) {
HasIndirectbr = TailBB.back().isIndirectBranch();
- HasComputedGoto = TailBB.terminatorIsComputedGoto();
+ HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
}
if (HasIndirectbr && PreRegAlloc)
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 18d6bbc..705e046e 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1406,7 +1406,7 @@ void TargetInstrInfo::reassociateOps(
const MCInstrDesc &MCID, Register DestReg) {
return MachineInstrBuilder(
MF, MF.CreateMachineInstr(MCID, MIMD.getDL(), /*NoImpl=*/true))
- .setPCSections(MIMD.getPCSections())
+ .copyMIMetadata(MIMD)
.addReg(DestReg, RegState::Define);
};
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 68b8a00..3c91b0e 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2062,7 +2062,7 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
// FreeBSD has "__stack_chk_guard" defined externally on libc.so
if (M.getDirectAccessExternalData() &&
- !TM.getTargetTriple().isWindowsGNUEnvironment() &&
+ !TM.getTargetTriple().isOSCygMing() &&
!(TM.getTargetTriple().isPPC64() &&
TM.getTargetTriple().isOSFreeBSD()) &&
(!TM.getTargetTriple().isOSDarwin() ||
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 725e951..e9172f4 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1060,27 +1060,27 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
auto &Context = getContext();
if (Kind.isMergeableConst4() && MergeableConst4Section)
- return Context.getELFSection(".rodata.cst4." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst4." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 4);
if (Kind.isMergeableConst8() && MergeableConst8Section)
- return Context.getELFSection(".rodata.cst8." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst8." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 8);
if (Kind.isMergeableConst16() && MergeableConst16Section)
- return Context.getELFSection(".rodata.cst16." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst16." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 16);
if (Kind.isMergeableConst32() && MergeableConst32Section)
- return Context.getELFSection(".rodata.cst32." + SectionSuffix,
+ return Context.getELFSection(".rodata.cst32." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 32);
if (Kind.isReadOnly())
- return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC);
+ return Context.getELFSection(".rodata." + SectionSuffix + ".",
+ ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
- return Context.getELFSection(".data.rel.ro." + SectionSuffix,
+ return Context.getELFSection(".data.rel.ro." + SectionSuffix + ".",
ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_WRITE);
}
diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
index 6267207..fd54190 100644
--- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
+++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
@@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable(
AddrOfOldGV, Twine("__ref_").concat(GV->getName()),
nullptr, GlobalVariable::NotThreadLocal);
+ // RefGV is created with isConstant = false, but we want to place RefGV into
+ // .rdata, not .data. It is important that the GlobalVariable be mutable
+ // from the compiler's point of view, so that the optimizer does not remove
+ // the global variable entirely and replace all references to it with its
+ // initial value.
+ //
+ // When the Windows hot-patch loader applies a hot-patch, it maps the
+ // pages of .rdata as read/write so that it can set each __ref_* variable
+ // to point to the original variable in the base image. Afterward, pages in
+ // .rdata are remapped as read-only. This protects the __ref_* variables from
+ // being overwritten during execution.
+ RefGV->setSection(".rdata");
+
// Create debug info for the replacement global variable.
DataLayout Layout = M->getDataLayout();
DIType *DebugType = DebugInfo.createPointerType(
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index bd0d72f..0e95369 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -157,8 +157,7 @@ private:
processSubtractRelocation(unsigned SectionID, relocation_iterator RelI,
const MachOObjectFile &BaseObj,
ObjSectionToIDMap &ObjSectionToID) {
- const MachOObjectFile &Obj =
- static_cast<const MachOObjectFile&>(BaseObj);
+ const MachOObjectFile &Obj = BaseObj;
MachO::any_relocation_info RE =
Obj.getRelocation(RelI->getRawDataRefImpl());
diff --git a/llvm/lib/Frontend/HLSL/CMakeLists.txt b/llvm/lib/Frontend/HLSL/CMakeLists.txt
index 5343469..3d22577 100644
--- a/llvm/lib/Frontend/HLSL/CMakeLists.txt
+++ b/llvm/lib/Frontend/HLSL/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_component_library(LLVMFrontendHLSL
CBuffer.cpp
+ HLSLBinding.cpp
HLSLResource.cpp
HLSLRootSignature.cpp
RootSignatureMetadata.cpp
diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
new file mode 100644
index 0000000..d581311
--- /dev/null
+++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
@@ -0,0 +1,142 @@
+//===- HLSLBinding.cpp - Representation for resource bindings in HLSL -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/HLSL/HLSLBinding.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+std::optional<uint32_t>
+BindingInfo::findAvailableBinding(dxil::ResourceClass RC, uint32_t Space,
+ int32_t Size) {
+ BindingSpaces &BS = getBindingSpaces(RC);
+ RegisterSpace &RS = BS.getOrInsertSpace(Space);
+ return RS.findAvailableBinding(Size);
+}
+
+BindingInfo::RegisterSpace &
+BindingInfo::BindingSpaces::getOrInsertSpace(uint32_t Space) {
+ for (auto It = Spaces.begin(), End = Spaces.end(); It != End; ++It) {
+ if (It->Space == Space)
+ return *It;
+ if (It->Space < Space)
+ continue;
+ return *Spaces.insert(It, Space);
+ }
+ return Spaces.emplace_back(Space);
+}
+
+std::optional<uint32_t>
+BindingInfo::RegisterSpace::findAvailableBinding(int32_t Size) {
+ assert((Size == -1 || Size > 0) && "invalid size");
+
+ if (FreeRanges.empty())
+ return std::nullopt;
+
+ // unbounded array
+ if (Size == -1) {
+ BindingRange &Last = FreeRanges.back();
+ if (Last.UpperBound != ~0u)
+ // this space is already occupied by an unbounded array
+ return std::nullopt;
+ uint32_t RegSlot = Last.LowerBound;
+ FreeRanges.pop_back();
+ return RegSlot;
+ }
+
+ // single resource or fixed-size array
+ for (BindingRange &R : FreeRanges) {
+ // compare the size as uint64_t to prevent overflow for range (0, ~0u)
+ if ((uint64_t)R.UpperBound - R.LowerBound + 1 < (uint64_t)Size)
+ continue;
+ uint32_t RegSlot = R.LowerBound;
+ // This might create a range where (LowerBound == UpperBound + 1). When
+ // that happens, the next time this function is called the range will
+ // skipped over by the check above (at this point Size is always > 0).
+ R.LowerBound += Size;
+ return RegSlot;
+ }
+
+ return std::nullopt;
+}
+
+BindingInfo BindingInfoBuilder::calculateBindingInfo(
+ llvm::function_ref<void(const BindingInfoBuilder &Builder,
+ const Binding &Overlapping)>
+ ReportOverlap) {
+ // sort all the collected bindings
+ llvm::stable_sort(Bindings);
+
+ // remove duplicates
+ Binding *NewEnd = llvm::unique(Bindings);
+ if (NewEnd != Bindings.end())
+ Bindings.erase(NewEnd);
+
+ BindingInfo Info;
+
+ // Go over the sorted bindings and build up lists of free register ranges
+ // for each binding type and used spaces. Bindings are sorted by resource
+ // class, space, and lower bound register slot.
+ BindingInfo::BindingSpaces *BS =
+ &Info.getBindingSpaces(dxil::ResourceClass::SRV);
+ for (const Binding &B : Bindings) {
+ if (BS->RC != B.RC)
+ // move to the next resource class spaces
+ BS = &Info.getBindingSpaces(B.RC);
+
+ BindingInfo::RegisterSpace *S = BS->Spaces.empty()
+ ? &BS->Spaces.emplace_back(B.Space)
+ : &BS->Spaces.back();
+ assert(S->Space <= B.Space && "bindings not sorted correctly?");
+ if (B.Space != S->Space)
+ // add new space
+ S = &BS->Spaces.emplace_back(B.Space);
+
+ // The space is full - there are no free slots left, or the rest of the
+ // slots are taken by an unbounded array. Report the overlapping to the
+ // caller.
+ if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < ~0u) {
+ ReportOverlap(*this, B);
+ continue;
+ }
+ // adjust the last free range lower bound, split it in two, or remove it
+ BindingInfo::BindingRange &LastFreeRange = S->FreeRanges.back();
+ if (LastFreeRange.LowerBound == B.LowerBound) {
+ if (B.UpperBound < ~0u)
+ LastFreeRange.LowerBound = B.UpperBound + 1;
+ else
+ S->FreeRanges.pop_back();
+ } else if (LastFreeRange.LowerBound < B.LowerBound) {
+ LastFreeRange.UpperBound = B.LowerBound - 1;
+ if (B.UpperBound < ~0u)
+ S->FreeRanges.emplace_back(B.UpperBound + 1, ~0u);
+ } else {
+ // We don't have room here. Report the overlapping binding to the caller
+ // and mark any extra space this binding would use as unavailable.
+ ReportOverlap(*this, B);
+ if (B.UpperBound < ~0u)
+ LastFreeRange.LowerBound =
+ std::max(LastFreeRange.LowerBound, B.UpperBound + 1);
+ else
+ S->FreeRanges.pop_back();
+ }
+ }
+
+ return Info;
+}
+
+const BindingInfoBuilder::Binding &BindingInfoBuilder::findOverlapping(
+ const BindingInfoBuilder::Binding &ReportedBinding) const {
+ for (const BindingInfoBuilder::Binding &Other : Bindings)
+ if (ReportedBinding.LowerBound <= Other.UpperBound &&
+ Other.LowerBound <= ReportedBinding.UpperBound)
+ return Other;
+
+ llvm_unreachable("Searching for overlap for binding that does not overlap");
+}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7928772..3aa4f7a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1161,7 +1161,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
Builder.restoreIP(AllocaIP);
auto *KernelArgsPtr =
Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
llvm::Value *Arg =
@@ -1189,7 +1189,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
if (!updateToLocation(Loc))
return Loc.IP;
- Builder.restoreIP(Loc.IP);
// On top of the arrays that were filled up, the target offloading call
// takes as arguments the device id as well as the host pointer. The host
// pointer is used by the runtime library to identify the current target
@@ -5955,7 +5954,7 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
Builder.restoreIP(AllocaIP);
AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
ArgsBase->setAlignment(Align(8));
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
// Store the index value with offset in depend vector.
for (unsigned I = 0; I < NumLoops; ++I) {
@@ -8081,7 +8080,7 @@ void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
".offload_ptrs");
AllocaInst *ArgSizes = Builder.CreateAlloca(
ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
- Builder.restoreIP(Loc.IP);
+ updateToLocation(Loc);
MapperAllocas.ArgsBase = ArgsBase;
MapperAllocas.Args = Args;
MapperAllocas.ArgSizes = ArgSizes;
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 28037d7..49c6dc7 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V,
return CreateShuffleVector(V, Zeros, Name + ".splat");
}
-Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
- Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex,
- MDNode *DbgInfo) {
+Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops,
+ const Twine &Name) {
+ assert(Ops.size() >= 2 && Ops.size() <= 8 &&
+ "Unexpected number of operands to interleave");
+
+ // Make sure all operands are the same type.
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type");
+
+#ifndef NDEBUG
+ for (unsigned I = 1; I < Ops.size(); I++) {
+ assert(Ops[I]->getType() == Ops[0]->getType() &&
+ "Vector interleave expects matching operand types!");
+ }
+#endif
+
+ unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size());
+ auto *SubvecTy = cast<VectorType>(Ops[0]->getType());
+ Type *DestTy = VectorType::get(SubvecTy->getElementType(),
+ SubvecTy->getElementCount() * Ops.size());
+ return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name);
+}
+
+Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base,
+ unsigned Dimension,
+ unsigned LastIndex,
+ MDNode *DbgInfo) {
auto *BaseType = Base->getType();
assert(isa<PointerType>(BaseType) &&
"Invalid Base ptr type for preserve.array.access.index.");
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 6c35ade..58a1f74 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
"Shouldn't change the signature");
return NewDecl;
}
+
+struct InterleaveIntrinsic {
+ Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+ {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+ {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+ {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+ {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+ {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+ {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+ {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 0dbd07f..1157cbe 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const {
Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct);
Result.Scope = Info.lookup(LLVMContext::MD_alias_scope);
Result.NoAlias = Info.lookup(LLVMContext::MD_noalias);
+ Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace);
}
return Result;
}
@@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) {
setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct);
setMetadata(LLVMContext::MD_alias_scope, N.Scope);
setMetadata(LLVMContext::MD_noalias, N.NoAlias);
+ setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace);
}
void Instruction::setNoSanitizeMetadata() {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3ff9895..924b5da 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -121,6 +121,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -1071,6 +1072,21 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) {
}
}
+ // Check llvm.loop.estimated_trip_count.
+ if (MD.getNumOperands() > 0 &&
+ MD.getOperand(0).equalsStr(LLVMLoopEstimatedTripCount)) {
+ Check(MD.getNumOperands() == 1 || MD.getNumOperands() == 2,
+ "Expected one or two operands", &MD);
+ if (MD.getNumOperands() == 2) {
+ auto *Count = dyn_cast_or_null<ConstantAsMetadata>(MD.getOperand(1));
+ Check(Count && Count->getType()->isIntegerTy() &&
+ cast<IntegerType>(Count->getType())->getBitWidth() <= 32,
+ "Expected optional second operand to be an integer constant of "
+ "type i32 or smaller",
+ &MD);
+ }
+ }
+
// Check these last, so we diagnose problems in operands first.
Check(!MD.isTemporary(), "Expected no forward declarations!", &MD);
Check(MD.isResolved(), "All nodes should be resolved!", &MD);
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 73e79c0..0323b4d 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StableHashing.h"
@@ -742,18 +743,19 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
Conf.VisibilityScheme = Config::ELF;
}
- const SymbolResolution *ResI = Res.begin();
- for (unsigned I = 0; I != Input->Mods.size(); ++I)
- if (Error Err = addModule(*Input, I, ResI, Res.end()))
+ ArrayRef<SymbolResolution> InputRes = Res;
+ for (unsigned I = 0; I != Input->Mods.size(); ++I) {
+ if (auto Err = addModule(*Input, InputRes, I, Res).moveInto(Res))
return Err;
+ }
- assert(ResI == Res.end());
+ assert(Res.empty());
return Error::success();
}
-Error LTO::addModule(InputFile &Input, unsigned ModI,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
+Expected<ArrayRef<SymbolResolution>>
+LTO::addModule(InputFile &Input, ArrayRef<SymbolResolution> InputRes,
+ unsigned ModI, ArrayRef<SymbolResolution> Res) {
Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
if (!LTOInfo)
return LTOInfo.takeError();
@@ -782,28 +784,32 @@ Error LTO::addModule(InputFile &Input, unsigned ModI,
bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular);
auto ModSyms = Input.module_symbols(ModI);
- addModuleToGlobalRes(ModSyms, {ResI, ResE},
+ addModuleToGlobalRes(ModSyms, Res,
IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0,
LTOInfo->HasSummary);
if (IsThinLTO)
- return addThinLTO(BM, ModSyms, ResI, ResE);
+ return addThinLTO(BM, ModSyms, Res);
RegularLTO.EmptyCombinedModule = false;
- Expected<RegularLTOState::AddedModule> ModOrErr =
- addRegularLTO(BM, ModSyms, ResI, ResE);
+ auto ModOrErr = addRegularLTO(Input, InputRes, BM, ModSyms, Res);
if (!ModOrErr)
return ModOrErr.takeError();
+ Res = ModOrErr->second;
- if (!LTOInfo->HasSummary)
- return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false);
+ if (!LTOInfo->HasSummary) {
+ if (Error Err = linkRegularLTO(std::move(ModOrErr->first),
+ /*LivenessFromIndex=*/false))
+ return Err;
+ return Res;
+ }
// Regular LTO module summaries are added to a dummy module that represents
// the combined regular LTO module.
if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, ""))
return Err;
- RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr));
- return Error::success();
+ RegularLTO.ModsWithSummaries.push_back(std::move(ModOrErr->first));
+ return Res;
}
// Checks whether the given global value is in a non-prevailing comdat
@@ -839,10 +845,11 @@ handleNonPrevailingComdat(GlobalValue &GV,
// Add a regular LTO object to the link.
// The resulting module needs to be linked into the combined LTO module with
// linkRegularLTO.
-Expected<LTO::RegularLTOState::AddedModule>
-LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
+Expected<
+ std::pair<LTO::RegularLTOState::AddedModule, ArrayRef<SymbolResolution>>>
+LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes,
+ BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
+ ArrayRef<SymbolResolution> Res) {
RegularLTOState::AddedModule Mod;
Expected<std::unique_ptr<Module>> MOrErr =
BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
@@ -855,13 +862,34 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
if (Error Err = M.materializeMetadata())
return std::move(Err);
- // If cfi.functions is present and we are in regular LTO mode, LowerTypeTests
- // will rename local functions in the merged module as "<function name>.1".
- // This causes linking errors, since other parts of the module expect the
- // original function name.
- if (LTOMode == LTOK_UnifiedRegular)
+ if (LTOMode == LTOK_UnifiedRegular) {
+ // cfi.functions metadata is intended to be used with ThinLTO and may
+ // trigger invalid IR transformations if they are present when doing regular
+ // LTO, so delete it.
if (NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"))
M.eraseNamedMetadata(CfiFunctionsMD);
+ } else if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ // Delete aliases entries for non-prevailing symbols on the ThinLTO side of
+ // this input file.
+ DenseSet<StringRef> Prevailing;
+ for (auto [I, R] : zip(Input.symbols(), InputRes))
+ if (R.Prevailing && !I.getIRName().empty())
+ Prevailing.insert(I.getIRName());
+ std::vector<MDNode *> AliasGroups;
+ for (MDNode *AliasGroup : AliasesMD->operands()) {
+ std::vector<Metadata *> Aliases;
+ for (Metadata *Alias : AliasGroup->operands()) {
+ if (isa<MDString>(Alias) &&
+ Prevailing.count(cast<MDString>(Alias)->getString()))
+ Aliases.push_back(Alias);
+ }
+ if (Aliases.size() > 1)
+ AliasGroups.push_back(MDTuple::get(RegularLTO.Ctx, Aliases));
+ }
+ AliasesMD->clearOperands();
+ for (MDNode *G : AliasGroups)
+ AliasesMD->addOperand(G);
+ }
UpgradeDebugInfo(M);
@@ -899,22 +927,22 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
std::set<const Comdat *> NonPrevailingComdats;
SmallSet<StringRef, 2> NonPrevailingAsmSymbols;
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResI != ResE);
- SymbolResolution Res = *ResI++;
+ assert(!Res.empty());
+ const SymbolResolution &R = Res.consume_front();
assert(MsymI != MsymE);
ModuleSymbolTable::Symbol Msym = *MsymI++;
Skip();
if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) {
- if (Res.Prevailing) {
+ if (R.Prevailing) {
if (Sym.isUndefined())
continue;
Mod.Keep.push_back(GV);
// For symbols re-defined with linker -wrap and -defsym options,
// set the linkage to weak to inhibit IPO. The linkage will be
// restored by the linker.
- if (Res.LinkerRedefined)
+ if (R.LinkerRedefined)
GV->setLinkage(GlobalValue::WeakAnyLinkage);
GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage();
@@ -938,7 +966,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
// Set the 'local' flag based on the linker resolution for this symbol.
- if (Res.FinalDefinitionInLinkageUnit) {
+ if (R.FinalDefinitionInLinkageUnit) {
GV->setDSOLocal(true);
if (GV->hasDLLImportStorageClass())
GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::
@@ -947,7 +975,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
} else if (auto *AS =
dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) {
// Collect non-prevailing symbols.
- if (!Res.Prevailing)
+ if (!R.Prevailing)
NonPrevailingAsmSymbols.insert(AS->first);
} else {
llvm_unreachable("unknown symbol type");
@@ -965,7 +993,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
CommonRes.Alignment =
std::max(Align(SymAlignValue), CommonRes.Alignment);
}
- CommonRes.Prevailing |= Res.Prevailing;
+ CommonRes.Prevailing |= R.Prevailing;
}
}
@@ -991,7 +1019,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
assert(MsymI == MsymE);
- return std::move(Mod);
+ return std::make_pair(std::move(Mod), Res);
}
Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
@@ -1032,19 +1060,19 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
}
// Add a ThinLTO module to the link.
-Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
- const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
- const SymbolResolution *ResITmp = ResI;
+Expected<ArrayRef<SymbolResolution>>
+LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
+ ArrayRef<SymbolResolution> Res) {
+ ArrayRef<SymbolResolution> ResTmp = Res;
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResITmp != ResE);
- SymbolResolution Res = *ResITmp++;
+ assert(!ResTmp.empty());
+ const SymbolResolution &R = ResTmp.consume_front();
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
- if (Res.Prevailing)
+ if (R.Prevailing)
ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
}
}
@@ -1059,14 +1087,14 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
for (const InputFile::Symbol &Sym : Syms) {
- assert(ResI != ResE);
- SymbolResolution Res = *ResI++;
+ assert(!Res.empty());
+ const SymbolResolution &R = Res.consume_front();
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
- if (Res.Prevailing) {
+ if (R.Prevailing) {
assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
BM.getModuleIdentifier());
@@ -1074,7 +1102,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
// switch the linkage to `weak` to prevent IPOs from happening.
// Find the summary in the module for this very GV and record the new
// linkage so that we can switch it when we import the GV.
- if (Res.LinkerRedefined)
+ if (R.LinkerRedefined)
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier()))
S->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1082,7 +1110,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
// If the linker resolved the symbol to a local definition then mark it
// as local in the summary for the module we are adding.
- if (Res.FinalDefinitionInLinkageUnit) {
+ if (R.FinalDefinitionInLinkageUnit) {
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier())) {
S->setDSOLocal(true);
@@ -1110,7 +1138,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
}
}
- return Error::success();
+ return Res;
}
unsigned LTO::getMaxTasks() const {
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 1074669..a214513 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() {
// For each entry, reserve space for 2 32-bit indices and a 64-bit count.
size_t SectionBytes =
W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
- (*CGProfileSection->begin()).appendContents(SectionBytes, 0);
+ (*CGProfileSection->begin())
+ .setVarContents(std::vector<char>(SectionBytes, 0));
}
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
@@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() {
// (instead of emitting a zero-sized section) so these relocations are
// technically valid, even though we don't expect these relocations to
// actually be applied by the linker.
- Frag->appendContents(8, 0);
+ constexpr char zero[8] = {};
+ Frag->setVarContents(zero);
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 9c7b05b..e277143 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -57,6 +57,10 @@ void MCObjectStreamer::insert(MCFragment *F) {
newFragment();
}
+void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
+ CurFrag->appendContents(Contents);
+}
+
void MCObjectStreamer::appendContents(size_t Num, char Elt) {
CurFrag->appendContents(Num, Elt);
}
@@ -538,8 +542,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getCurrentFragment();
- DF->appendContents(ArrayRef(Data.data(), Data.size()));
+ appendContents(ArrayRef(Data.data(), Data.size()));
}
void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 898ac5d..26f45ce 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -103,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getCurrentFragment();
- const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- std::optional<MCFixupKind> MaybeKind =
- getAssembler().getBackend().getFixupKind("R_REF");
- if (!MaybeKind)
- report_fatal_error("failed to get fixup kind for R_REF relocation");
-
- MCFixupKind Kind = *MaybeKind;
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind);
- DF->addFixup(Fixup);
+ addFixup(MCSymbolRefExpr::create(Symbol, getContext()),
+ XCOFF::RelocationType::R_REF);
}
void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 7b5c3c0..e87696a 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() {
}
MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
SectionKind::getMetadata());
- llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
index 62a71d4..9b55f76 100644
--- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
// it is, find the target section unique id.
const coff_aux_section_definition *SD = SymRef.getSectionDefinition();
const coff_aux_weak_external *WE = SymRef.getWeakExternal();
- if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) {
int32_t Index = SD->getNumber(IsBigObj);
if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size())
return createStringError(object_error::parse_failed,
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
index 2d74d1d..5863490 100644
--- a/llvm/lib/Object/SFrameParser.cpp
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -10,27 +10,41 @@
#include "llvm/BinaryFormat/SFrame.h"
#include "llvm/Object/Error.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
using namespace llvm::object;
-template <typename T>
-static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
- uint64_t Offset) {
- static_assert(std::is_trivial_v<T>);
- if (Data.size() < Offset + sizeof(T)) {
+static Expected<ArrayRef<uint8_t>>
+getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) {
+ uint64_t End = SaturatingAdd(Offset, Size);
+ // Data.size() cannot be UINT64_MAX, as it would occupy the whole address
+ // space.
+ if (End > Data.size()) {
return createStringError(
formatv("unexpected end of data at offset {0:x} while reading [{1:x}, "
"{2:x})",
- Data.size(), Offset, Offset + sizeof(T))
+ Data.size(), Offset, End)
.str(),
object_error::unexpected_eof);
}
- return *reinterpret_cast<const T *>(Data.data() + Offset);
+ return Data.slice(Offset, Size);
+}
+
+template <typename T>
+static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
+ uint64_t Offset) {
+ static_assert(std::is_trivial_v<T>);
+ Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T));
+ if (!Slice)
+ return Slice.takeError();
+
+ return *reinterpret_cast<const T *>(Slice->data());
}
template <endianness E>
-Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
+Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents,
+ uint64_t SectionAddress) {
Expected<const sframe::Preamble<E> &> Preamble =
getDataSliceAs<sframe::Preamble<E>>(Contents, 0);
if (!Preamble)
@@ -48,8 +62,44 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
getDataSliceAs<sframe::Header<E>>(Contents, 0);
if (!Header)
return Header.takeError();
- return SFrameParser(Contents, *Header);
+ return SFrameParser(Contents, SectionAddress, *Header);
+}
+
+template <endianness E>
+Expected<ArrayRef<uint8_t>> SFrameParser<E>::getAuxHeader() const {
+ return getDataSlice(Data, sizeof(Header), Header.AuxHdrLen);
+}
+
+template <endianness E>
+Expected<ArrayRef<sframe::FuncDescEntry<E>>> SFrameParser<E>::fdes() const {
+ Expected<ArrayRef<uint8_t>> Slice = getDataSlice(
+ Data, getFDEBase(), Header.NumFDEs * sizeof(sframe::FuncDescEntry<E>));
+ if (!Slice)
+ return Slice.takeError();
+ return ArrayRef(
+ reinterpret_cast<const sframe::FuncDescEntry<E> *>(Slice->data()),
+ Header.NumFDEs);
+}
+
+template <endianness E>
+uint64_t SFrameParser<E>::getAbsoluteStartAddress(
+ typename FDERange::iterator FDE) const {
+ uint64_t Result = SectionAddress + FDE->StartAddress;
+
+ if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) ==
+ sframe::Flags::FDEFuncStartPCRel) {
+ uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data());
+ uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE);
+
+ assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() &&
+ "Iterator does not belong to this object!");
+
+ Result += FDEPtr - DataPtr;
+ }
+
+ return Result;
}
-template class llvm::object::SFrameParser<endianness::big>;
-template class llvm::object::SFrameParser<endianness::little>;
+template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>;
+template class LLVM_EXPORT_TEMPLATE
+ llvm::object::SFrameParser<endianness::little>;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index f810368..90ea0c1 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -252,6 +252,7 @@
#include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h"
#include "llvm/Transforms/Instrumentation/PGOCtxProfFlattening.h"
#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
+#include "llvm/Transforms/Instrumentation/PGOEstimateTripCounts.h"
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 98821bb..d3f741d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -80,6 +80,7 @@
#include "llvm/Transforms/Instrumentation/MemProfUse.h"
#include "llvm/Transforms/Instrumentation/PGOCtxProfFlattening.h"
#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
+#include "llvm/Transforms/Instrumentation/PGOEstimateTripCounts.h"
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
@@ -1239,6 +1240,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
MPM.addPass(AssignGUIDPass());
if (IsCtxProfUse) {
MPM.addPass(PGOCtxProfFlatteningPass(/*IsPreThinlink=*/true));
+ MPM.addPass(PGOEstimateTripCountsPass());
return MPM;
}
// Block further inlining in the instrumented ctxprof case. This avoids
@@ -1268,8 +1270,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
if (PGOOpt && (PGOOpt->Action == PGOOptions::IRUse ||
- PGOOpt->Action == PGOOptions::SampleUse))
+ PGOOpt->Action == PGOOptions::SampleUse)) {
MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
+ }
+ MPM.addPass(PGOEstimateTripCountsPass());
MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
@@ -2355,4 +2359,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
bool PassBuilder::isInstrumentedPGOUse() const {
return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
!UseCtxProfile.empty();
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index fd89583..f830b8ce 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
MODULE_PASS("globalopt", GlobalOptPass())
MODULE_PASS("globalsplit", GlobalSplitPass())
MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
+MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass())
MODULE_PASS("hipstdpar-select-accelerator-code",
HipStdParAcceleratorCodeSelectionPass())
MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
@@ -123,6 +124,7 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink",
OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("partial-inliner", PartialInlinerPass())
+MODULE_PASS("pgo-estimate-trip-counts", PGOEstimateTripCountsPass())
MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index 235b134..3fc0dbf 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -135,7 +135,7 @@ readMemInfoBlocksV3(const char *Ptr) {
}
llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
-readMemInfoBlocksV4(const char *Ptr) {
+readMemInfoBlocksCommon(const char *Ptr, bool IsHistogramEncoded = false) {
using namespace support;
const uint64_t NumItemsToRead =
@@ -145,27 +145,74 @@ readMemInfoBlocksV4(const char *Ptr) {
for (uint64_t I = 0; I < NumItemsToRead; I++) {
const uint64_t Id =
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
- // We cheat a bit here and remove the const from cast to set the
- // Histogram Pointer to newly allocated buffer.
- MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr);
- // Only increment by size of MIB since readNext implicitly increments.
- Ptr += sizeof(MemInfoBlock);
+ MemInfoBlock MIB;
+#define READ_MIB_FIELD(FIELD) \
+ MIB.FIELD = endian::readNext<decltype(MIB.FIELD), llvm::endianness::little, \
+ unaligned>(Ptr)
+
+ READ_MIB_FIELD(AllocCount);
+ READ_MIB_FIELD(TotalAccessCount);
+ READ_MIB_FIELD(MinAccessCount);
+ READ_MIB_FIELD(MaxAccessCount);
+ READ_MIB_FIELD(TotalSize);
+ READ_MIB_FIELD(MinSize);
+ READ_MIB_FIELD(MaxSize);
+ READ_MIB_FIELD(AllocTimestamp);
+ READ_MIB_FIELD(DeallocTimestamp);
+ READ_MIB_FIELD(TotalLifetime);
+ READ_MIB_FIELD(MinLifetime);
+ READ_MIB_FIELD(MaxLifetime);
+ READ_MIB_FIELD(AllocCpuId);
+ READ_MIB_FIELD(DeallocCpuId);
+ READ_MIB_FIELD(NumMigratedCpu);
+ READ_MIB_FIELD(NumLifetimeOverlaps);
+ READ_MIB_FIELD(NumSameAllocCpu);
+ READ_MIB_FIELD(NumSameDeallocCpu);
+ READ_MIB_FIELD(DataTypeId);
+ READ_MIB_FIELD(TotalAccessDensity);
+ READ_MIB_FIELD(MinAccessDensity);
+ READ_MIB_FIELD(MaxAccessDensity);
+ READ_MIB_FIELD(TotalLifetimeAccessDensity);
+ READ_MIB_FIELD(MinLifetimeAccessDensity);
+ READ_MIB_FIELD(MaxLifetimeAccessDensity);
+ READ_MIB_FIELD(AccessHistogramSize);
+ READ_MIB_FIELD(AccessHistogram);
+#undef READ_MIB_FIELD
if (MIB.AccessHistogramSize > 0) {
+ // The in-memory representation uses uint64_t for histogram entries.
MIB.AccessHistogram =
(uintptr_t)malloc(MIB.AccessHistogramSize * sizeof(uint64_t));
- }
-
- for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) {
- ((uint64_t *)MIB.AccessHistogram)[J] =
- endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+ for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) {
+ if (!IsHistogramEncoded) {
+ ((uint64_t *)MIB.AccessHistogram)[J] =
+ endian::readNext<uint64_t, llvm::endianness::little, unaligned>(
+ Ptr);
+ } else {
+ // The encoded on-disk format (V5 onwards) uses uint16_t.
+ const uint16_t Val =
+ endian::readNext<uint16_t, llvm::endianness::little, unaligned>(
+ Ptr);
+ ((uint64_t *)MIB.AccessHistogram)[J] = decodeHistogramCount(Val);
+ }
+ }
}
Items.push_back({Id, MIB});
}
return Items;
}
+llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
+readMemInfoBlocksV4(const char *Ptr) {
+ return readMemInfoBlocksCommon(Ptr);
+}
+
+llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
+readMemInfoBlocksV5(const char *Ptr) {
+ return readMemInfoBlocksCommon(Ptr, /*IsHistogramEncoded=*/true);
+}
+
CallStackMap readStackInfo(const char *Ptr) {
using namespace support;
@@ -658,6 +705,8 @@ RawMemProfReader::readMemInfoBlocks(const char *Ptr) {
return readMemInfoBlocksV3(Ptr);
if (MemprofRawVersion == 4ULL)
return readMemInfoBlocksV4(Ptr);
+ if (MemprofRawVersion == 5ULL)
+ return readMemInfoBlocksV5(Ptr);
llvm_unreachable(
"Panic: Unsupported version number when reading MemInfoBlocks");
}
diff --git a/llvm/lib/Support/BLAKE3/CMakeLists.txt b/llvm/lib/Support/BLAKE3/CMakeLists.txt
index eae2b02..90311ae 100644
--- a/llvm/lib/Support/BLAKE3/CMakeLists.txt
+++ b/llvm/lib/Support/BLAKE3/CMakeLists.txt
@@ -26,7 +26,8 @@ endmacro()
if (CAN_USE_ASSEMBLER)
if (MSVC)
check_symbol_exists(_M_X64 "" IS_X64)
- if (IS_X64)
+ check_symbol_exists(_M_ARM64EC "" IS_ARM64EC)
+ if (IS_X64 AND NOT IS_ARM64EC)
enable_language(ASM_MASM)
set(LLVM_BLAKE3_ASM_FILES
blake3_sse2_x86-64_windows_msvc.asm
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 5bb04d0..b6f338f 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -24,11 +24,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
+#include <utility>
#include "DebugOptions.h"
@@ -38,27 +40,62 @@
using namespace llvm;
+/// Parse a debug type string into a pair of the debug type and the debug level.
+/// The expected format is "type[:level]", where the level is an optional
+/// integer.
+static std::pair<std::string, std::optional<int>>
+parseDebugType(StringRef DbgType) {
+ std::optional<int> Level;
+ size_t ColonPos = DbgType.find(':');
+ if (ColonPos != StringRef::npos) {
+ StringRef LevelStr = DbgType.substr(ColonPos + 1);
+ DbgType = DbgType.take_front(ColonPos);
+ if (LevelStr.empty())
+ Level = 0;
+ else {
+ int parsedLevel;
+ if (to_integer(LevelStr, parsedLevel, 10))
+ Level = parsedLevel;
+ }
+ }
+ return std::make_pair(DbgType.str(), Level);
+}
+
// Even though LLVM might be built with NDEBUG, define symbols that the code
// built without NDEBUG can depend on via the llvm/Support/Debug.h header.
namespace llvm {
/// Exported boolean set by the -debug option.
bool DebugFlag = false;
-static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+/// The current debug type and an optional debug level.
+/// The debug level is the verbosity of the debug output.
+/// 0 is a special level that acts as an opt-out for this specific debug type.
+/// If provided, the debug output is enabled only if the user specified a level
+/// at least as high as the provided level.
+static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>>
+ CurrentDebugType;
/// Return true if the specified string is the debug type
/// specified on the command line, or if none was specified on the command line
/// with the -debug-only=X option.
-bool isCurrentDebugType(const char *DebugType) {
+bool isCurrentDebugType(const char *DebugType, int Level) {
if (CurrentDebugType->empty())
return true;
+ // Track if there is at least one debug type with a level, this is used
+ // to allow to opt-out of some DebugType and leaving all the others enabled.
+ bool HasEnabledDebugType = false;
// See if DebugType is in list. Note: do not use find() as that forces us to
// unnecessarily create an std::string instance.
- for (auto &d : *CurrentDebugType) {
- if (d == DebugType)
+ for (auto &D : *CurrentDebugType) {
+ HasEnabledDebugType =
+ HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0);
+ if (D.first != DebugType)
+ continue;
+ if (!D.second.has_value())
return true;
+ return D.second >= Level;
}
- return false;
+ return !HasEnabledDebugType;
}
/// Set the current debug type, as if the -debug-only=X
@@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) {
void setCurrentDebugTypes(const char **Types, unsigned Count) {
CurrentDebugType->clear();
- llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
+ CurrentDebugType->reserve(Count);
+ for (const char *Type : ArrayRef(Types, Count))
+ CurrentDebugType->push_back(parseDebugType(Type));
}
+
} // namespace llvm
// All Debug.h functionality is a no-op in NDEBUG mode.
@@ -114,10 +154,10 @@ struct DebugOnlyOpt {
if (Val.empty())
return;
DebugFlag = true;
- SmallVector<StringRef,8> dbgTypes;
- StringRef(Val).split(dbgTypes, ',', -1, false);
- for (auto dbgType : dbgTypes)
- CurrentDebugType->push_back(std::string(dbgType));
+ SmallVector<StringRef, 8> DbgTypes;
+ StringRef(Val).split(DbgTypes, ',', -1, false);
+ for (auto DbgType : DbgTypes)
+ CurrentDebugType->push_back(parseDebugType(DbgType));
}
};
} // namespace
@@ -129,8 +169,13 @@ struct CreateDebugOnly {
static void *call() {
return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>(
"debug-only",
- cl::desc("Enable a specific type of debug output (comma separated list "
- "of types)"),
+ cl::desc(
+ "Enable a specific type of debug output (comma separated list "
+ "of types using the format \"type[:level]\", where the level "
+ "is an optional integer. The level can be set to 1, 2, 3, etc. to "
+ "control the verbosity of the output. Setting a debug-type level "
+ "to zero acts as an opt-out for this specific debug-type without "
+ "affecting the others."),
cl::Hidden, cl::value_desc("debug string"),
cl::location(DebugOnlyOptLoc), cl::ValueRequired);
}
diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index 29436f8..edb5313 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -313,5 +313,6 @@ private:
IntrusiveRefCntPtr<vfs::FileSystem>
FileCollector::createCollectorVFS(IntrusiveRefCntPtr<vfs::FileSystem> BaseFS,
std::shared_ptr<FileCollector> Collector) {
- return new FileCollectorFileSystem(std::move(BaseFS), std::move(Collector));
+ return makeIntrusiveRefCnt<FileCollectorFileSystem>(std::move(BaseFS),
+ std::move(Collector));
}
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 277247e..cc02cae 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) {
size_t Size = Buf.size();
#endif
ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size);
- if (ssize_t(NumRead) == -1)
+ if (NumRead == -1)
return errorCodeToError(errnoAsErrorCode());
// The underlying operation on these platforms allow opening directories
// for reading in more cases than other platforms.
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index e489282..5d42488 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -397,7 +397,8 @@ void RealFileSystem::printImpl(raw_ostream &OS, PrintType Type,
}
IntrusiveRefCntPtr<FileSystem> vfs::getRealFileSystem() {
- static IntrusiveRefCntPtr<FileSystem> FS(new RealFileSystem(true));
+ static IntrusiveRefCntPtr<FileSystem> FS =
+ makeIntrusiveRefCnt<RealFileSystem>(true);
return FS;
}
@@ -2217,9 +2218,9 @@ RedirectingFileSystem::create(std::unique_ptr<MemoryBuffer> Buffer,
std::unique_ptr<RedirectingFileSystem> RedirectingFileSystem::create(
ArrayRef<std::pair<std::string, std::string>> RemappedFiles,
- bool UseExternalNames, FileSystem &ExternalFS) {
+ bool UseExternalNames, llvm::IntrusiveRefCntPtr<FileSystem> ExternalFS) {
std::unique_ptr<RedirectingFileSystem> FS(
- new RedirectingFileSystem(&ExternalFS));
+ new RedirectingFileSystem(ExternalFS));
FS->UseExternalNames = UseExternalNames;
StringMap<RedirectingFileSystem::Entry *> Entries;
@@ -2228,7 +2229,7 @@ std::unique_ptr<RedirectingFileSystem> RedirectingFileSystem::create(
SmallString<128> From = StringRef(Mapping.first);
SmallString<128> To = StringRef(Mapping.second);
{
- auto EC = ExternalFS.makeAbsolute(From);
+ auto EC = ExternalFS->makeAbsolute(From);
(void)EC;
assert(!EC && "Could not make absolute path");
}
@@ -2250,7 +2251,7 @@ std::unique_ptr<RedirectingFileSystem> RedirectingFileSystem::create(
}
assert(Parent && "File without a directory?");
{
- auto EC = ExternalFS.makeAbsolute(To);
+ auto EC = ExternalFS->makeAbsolute(To);
(void)EC;
assert(!EC && "Could not make absolute path");
}
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index d862dbd..8dd7c88 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
Name.clear();
}
+namespace llvm::sys::windows {
+HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) {
+ // Ensure we load indeed a module from system32 path.
+ // As per GetModuleHandle documentation:
+ // "If lpModuleName does not include a path and there is more than one loaded
+ // module with the same base name and extension, you cannot predict which
+ // module handle will be returned.". This mitigates
+ // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637
+ SmallVector<wchar_t, MAX_PATH> Buf;
+ size_t Size = MAX_PATH;
+ do {
+ Buf.resize_for_overwrite(Size);
+ SetLastError(NO_ERROR);
+ Size = ::GetSystemDirectoryW(Buf.data(), Buf.size());
+ if (Size == 0)
+ return NULL;
+
+ // Try again with larger buffer.
+ } while (Size > Buf.size());
+
+ Buf.truncate(Size);
+ Buf.push_back(L'\\');
+ Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName));
+ Buf.push_back(0);
+
+ return ::GetModuleHandleW(Buf.data());
+}
+} // namespace llvm::sys::windows
+
SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
+ HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll");
+ if (kernelM) {
+ // SetThreadInformation is only available on Windows 8 and later. Since we
+ // still support compilation on Windows 7, we load the function dynamically.
+ typedef BOOL(WINAPI * SetThreadInformation_t)(
+ HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass,
+ _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation,
+ ULONG ThreadInformationSize);
+ static const auto pfnSetThreadInformation =
+ (SetThreadInformation_t)::GetProcAddress(kernelM,
+ "SetThreadInformation");
+ if (pfnSetThreadInformation) {
+ auto setThreadInformation = [](ULONG ControlMaskAndStateMask) {
+ THREAD_POWER_THROTTLING_STATE state{};
+ state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+ state.ControlMask = ControlMaskAndStateMask;
+ state.StateMask = ControlMaskAndStateMask;
+ return pfnSetThreadInformation(
+ ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state));
+ };
+
+ // Use EcoQoS for ThreadPriority::Background available (running on most
+ // efficent cores at the most efficient cpu frequency):
+ // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation
+ // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service
+ setThreadInformation(Priority == ThreadPriority::Background
+ ? THREAD_POWER_THROTTLING_EXECUTION_SPEED
+ : 0);
+ }
+ }
+
// https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
// Begin background processing mode. The system lowers the resource scheduling
// priorities of the thread so that it can perform background work without
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 1f3e5dc..3f318e2 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const {
}
break;
+ case GETDAGOPNAME:
+ if (const auto *Dag = dyn_cast<DagInit>(LHS)) {
+ return Dag->getName();
+ }
+ break;
+
case LOG2:
if (const auto *LHSi = dyn_cast_or_null<IntInit>(
LHS->convertInitializerTo(IntRecTy::get(RK)))) {
@@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const {
case SIZE: Result = "!size"; break;
case EMPTY: Result = "!empty"; break;
case GETDAGOP: Result = "!getdagop"; break;
+ case GETDAGOPNAME:
+ Result = "!getdagopname";
+ break;
case LOG2 : Result = "!logtwo"; break;
case LISTFLATTEN:
Result = "!listflatten";
@@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
SmallVector<std::pair<const Init *, const StringInit *>, 8> Args;
llvm::append_range(Args, LHSs->getArgAndNames());
llvm::append_range(Args, RHSs->getArgAndNames());
- return DagInit::get(Op, Args);
+ // Use the name of the LHS DAG if it's set, otherwise the name of the RHS.
+ const auto *NameInit = LHSs->getName();
+ if (!NameInit)
+ NameInit = RHSs->getName();
+ return DagInit::get(Op, NameInit, Args);
}
break;
}
@@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames());
break;
}
+ case SETDAGOPNAME: {
+ const auto *Dag = dyn_cast<DagInit>(LHS);
+ const auto *Op = dyn_cast<StringInit>(RHS);
+ if (Dag && Op)
+ return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(),
+ Dag->getArgNames());
+ break;
+ }
case ADD:
case SUB:
case MUL:
@@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const {
case STRCONCAT: Result = "!strconcat"; break;
case INTERLEAVE: Result = "!interleave"; break;
case SETDAGOP: Result = "!setdagop"; break;
+ case SETDAGOPNAME:
+ Result = "!setdagopname";
+ break;
case GETDAGARG:
Result = "!getdagarg<" + getType()->getAsString() + ">";
break;
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index aea1bb0..c369916 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("find", tgtok::XFind)
.Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
.Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+ .Case("setdagopname", tgtok::XSetDagOpName)
+ .Case("getdagopname", tgtok::XGetDagOpName)
.Case("getdagarg", tgtok::XGetDagArg)
.Case("getdagname", tgtok::XGetDagName)
.Case("setdagarg", tgtok::XSetDagArg)
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index ed7d8f3..5725e39 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -150,6 +150,8 @@ enum TokKind {
XGt,
XSetDagOp,
XGetDagOp,
+ XSetDagOpName,
+ XGetDagOpName,
XExists,
XListRemove,
XToLower,
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 62c5355..81b61b1 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TGParser.h"
+#include "TGLexer.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XCast:
case tgtok::XRepr:
case tgtok::XGetDagOp:
+ case tgtok::XGetDagOpName:
case tgtok::XInitialized: { // Value ::= !unop '(' Value ')'
UnOpInit::UnaryOp Code;
const RecTy *Type = nullptr;
@@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
Code = UnOpInit::GETDAGOP;
break;
+ case tgtok::XGetDagOpName:
+ Lex.Lex(); // eat the operation
+ Type = StringRecTy::get(Records);
+ Code = UnOpInit::GETDAGOPNAME;
+ break;
case tgtok::XInitialized:
Lex.Lex(); // eat the operation
Code = UnOpInit::INITIALIZED;
@@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XInterleave:
case tgtok::XGetDagArg:
case tgtok::XGetDagName:
- case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')'
+ case tgtok::XSetDagOp:
+ case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')'
tgtok::TokKind OpTok = Lex.getCode();
SMLoc OpLoc = Lex.getLoc();
Lex.Lex(); // eat the operation
@@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break;
case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break;
case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break;
+ case tgtok::XSetDagOpName:
+ Code = BinOpInit::SETDAGOPNAME;
+ break;
case tgtok::XGetDagArg:
Code = BinOpInit::GETDAGARG;
break;
@@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
ArgType = DagRecTy::get(Records);
break;
+ case tgtok::XSetDagOpName:
+ Type = DagRecTy::get(Records);
+ ArgType = DagRecTy::get(Records);
+ break;
case tgtok::XGetDagName:
Type = StringRecTy::get(Records);
ArgType = DagRecTy::get(Records);
@@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
// Deal with BinOps whose arguments have different types, by
// rewriting ArgType in between them.
switch (Code) {
- case BinOpInit::SETDAGOP:
- // After parsing the first dag argument, switch to expecting
- // a record, with no restriction on its superclasses.
- ArgType = RecordRecTy::get(Records, {});
- break;
- case BinOpInit::GETDAGARG:
- // After parsing the first dag argument, expect an index integer or a
- // name string.
- ArgType = nullptr;
- break;
- case BinOpInit::GETDAGNAME:
- // After parsing the first dag argument, expect an index integer.
- ArgType = IntRecTy::get(Records);
- break;
- default:
- break;
+ case BinOpInit::SETDAGOPNAME:
+ // After parsing the first dag argument, expect a string.
+ ArgType = StringRecTy::get(Records);
+ break;
+ case BinOpInit::SETDAGOP:
+ // After parsing the first dag argument, switch to expecting
+ // a record, with no restriction on its superclasses.
+ ArgType = RecordRecTy::get(Records, {});
+ break;
+ case BinOpInit::GETDAGARG:
+ // After parsing the first dag argument, expect an index integer or a
+ // name string.
+ ArgType = nullptr;
+ break;
+ case BinOpInit::GETDAGNAME:
+ // After parsing the first dag argument, expect an index integer.
+ ArgType = IntRecTy::get(Records);
+ break;
+ default:
+ break;
}
if (!consume(tgtok::comma))
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ca09598..99f0af5 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -39,8 +39,8 @@ let Predicates = [HasDotProd] in {
def ext_addv_to_udot_addv : GICombineRule<
(defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo),
(match (wip_match_opcode G_VECREDUCE_ADD):$root,
- [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]),
- (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }])
+ [{ return matchExtAddvToDotAddv(*${root}, MRI, STI, ${matchinfo}); }]),
+ (apply [{ applyExtAddvToDotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }])
>;
}
@@ -62,8 +62,10 @@ class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICom
def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
+def push_mul_through_zext : push_opcode_through_ext<G_MUL, G_ZEXT>;
def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;
+def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>;
def AArch64PreLegalizerCombiner: GICombiner<
"AArch64PreLegalizerCombinerImpl", [all_combines,
@@ -75,8 +77,10 @@ def AArch64PreLegalizerCombiner: GICombiner<
ext_uaddv_to_uaddlv,
push_sub_through_zext,
push_add_through_zext,
+ push_mul_through_zext,
push_sub_through_sext,
- push_add_through_sext]> {
+ push_add_through_sext,
+ push_mul_through_sext]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index eca7ca5..ad42f4b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 0,
AArch64::LDNT1B_2Z_IMM_PSEUDO,
AArch64::LDNT1B_2Z_PSEUDO);
@@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 1,
AArch64::LDNT1H_2Z_IMM_PSEUDO,
AArch64::LDNT1H_2Z_PSEUDO);
@@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 2,
AArch64::LDNT1W_2Z_IMM_PSEUDO,
AArch64::LDNT1W_2Z_PSEUDO);
@@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 3,
AArch64::LDNT1D_2Z_IMM_PSEUDO,
AArch64::LDNT1D_2Z_PSEUDO);
@@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 0,
AArch64::LDNT1B_4Z_IMM_PSEUDO,
AArch64::LDNT1B_4Z_PSEUDO);
@@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 1,
AArch64::LDNT1H_4Z_IMM_PSEUDO,
AArch64::LDNT1H_4Z_PSEUDO);
@@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 2,
AArch64::LDNT1W_4Z_IMM_PSEUDO,
AArch64::LDNT1W_4Z_PSEUDO);
@@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 3,
AArch64::LDNT1D_4Z_IMM_PSEUDO,
AArch64::LDNT1D_4Z_PSEUDO);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b49754..4f6e3dd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8952,6 +8952,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID &CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
+ const CallBase *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
MachineFunction::CallSiteInfo CSInfo;
@@ -8991,6 +8992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
*DAG.getContext());
RetCCInfo.AnalyzeCallResult(Ins, RetCC);
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
+
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
@@ -11325,7 +11330,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
SDValue AArch64TargetLowering::LowerSELECT_CC(
ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
- iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+ iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
const SDLoc &DL, SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -11386,6 +11391,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Canonicalise absolute difference patterns:
+ // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
+ // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
+ //
+ // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
+ // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
+ // The second forms can be matched into subs+cneg.
+ if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
+ if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
+ FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS)
+ FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
+ else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
+ FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS)
+ TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
+ }
+
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
@@ -11523,7 +11544,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return true;
}
})) {
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
SDValue VectorCmp =
emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
if (VectorCmp)
@@ -11537,7 +11558,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (Flags.hasNoSignedZeros()) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
@@ -11616,10 +11637,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
+ SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
- return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
- DAG);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11627,7 +11647,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
SDLoc DL(Op);
EVT Ty = Op.getValueType();
@@ -11694,8 +11713,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
DAG.getUNDEF(MVT::f32), FVal);
}
- SDValue Res =
- LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
+ SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
+ Op->getFlags(), DL, DAG);
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -12292,7 +12311,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
- SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
+ // Ensure nodes can be recognized by isAssociativeAndCommutative.
+ SDNodeFlags Flags =
+ SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16674,7 +16695,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ I->getFastMathFlags().allowContract()));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -24112,6 +24133,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result directly
+// without transferring the result into a GPR in the process.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ // Limit to post-legalization in order to avoid peeling truncating stores.
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+ // Source operand is already a vector.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType().isVector())
+ return SDValue();
+
+ // Look through potential assertions.
+ while (Value->isAssert())
+ Value = Value.getOperand(0);
+
+ if (Value.getOpcode() != ISD::FP_TO_SINT &&
+ Value.getOpcode() != ISD::FP_TO_UINT)
+ return SDValue();
+ if (!Value->hasOneUse())
+ return SDValue();
+
+ SDValue FPSrc = Value.getOperand(0);
+ EVT SrcVT = FPSrc.getValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // No support for assignments such as i64 = fp_to_sint i32
+ EVT VT = Value.getSimpleValueType();
+ if (VT != SrcVT.changeTypeToInteger())
+ return SDValue();
+
+ // Create a 128-bit element vector to avoid widening. The floating point
+ // conversion is transformed into a single element conversion via a pattern.
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+ EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+ EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+ SDLoc DL(ST);
+ SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+ SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+ DCI.CombineTo(ST->getValue().getNode(), Extracted);
+ return SDValue(ST, 0);
+}
+
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24194,6 +24269,9 @@ static SDValue performSTORECombine(SDNode *N,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(ST);
+ if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+ return Res;
+
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
EVT EltVT = VT.getVectorElementType();
return EltVT == MVT::f32 || EltVT == MVT::f64;
@@ -26926,6 +27004,23 @@ static SDValue performSHLCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
}
+static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntrinsicID = N->getConstantOperandVal(1);
+ auto Register =
+ (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
+ : AArch64SysReg::RNDRRS);
+ SDLoc DL(N);
+ SDValue A = DAG.getNode(
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
+ N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
+ SDValue B = DAG.getNode(
+ AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
+ return DAG.getMergeValues(
+ {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -27241,22 +27336,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_rndr:
- case Intrinsic::aarch64_rndrrs: {
- unsigned IntrinsicID = N->getConstantOperandVal(1);
- auto Register =
- (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
- : AArch64SysReg::RNDRRS);
- SDLoc DL(N);
- SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
- N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
- SDValue B = DAG.getNode(
- AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
- return DAG.getMergeValues(
- {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
- }
+ case Intrinsic::aarch64_rndrrs:
+ return performRNDRCombine(N, DAG);
case Intrinsic::aarch64_sme_ldr_zt:
return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
DAG.getVTList(MVT::Other), N->getOperand(0),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 95d0e3b..ea63edd8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -662,7 +662,7 @@ private:
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal,
iterator_range<SDNode::user_iterator> Users,
- bool HasNoNans, const SDLoc &dl,
+ SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8685d7a0..59d4fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
// the target options or if FADD/FSUB has the contract fast-math flag.
- return Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
Inst.getFlag(MachineInstr::FmContract);
- return true;
}
return false;
}
@@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
case AArch64::FMUL_ZZZ_H:
case AArch64::FMUL_ZZZ_S:
case AArch64::FMUL_ZZZ_D:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
- (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
- Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// == Integer types ==
// -- Base instructions --
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07cacfa..251fd44 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index abcd550..b97d622 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -12,7 +12,7 @@
// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
-// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
template <typename T>
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+ // Strategy used to split logical immediate bitmasks.
+ enum class SplitStrategy {
+ Intersect,
+ };
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
+ bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
- if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
- return false;
-
- // If this immediate can be handled by one instruction, do not split it.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
- if (Insn.size() == 1)
- return false;
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
- unsigned OtherOpc) {
+bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy,
+ unsigned OtherOpc) {
// Try below transformation.
//
// MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
@@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
return splitTwoPartImm<T>(
MI,
- [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
- if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
+ // If this immediate is already a suitable bitmask, don't split it.
+ // TODO: Should we just combine the two instructions in this case?
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return std::nullopt;
+
+ // If this immediate can be handled by one instruction, don't split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return std::nullopt;
+
+ bool SplitSucc = false;
+ switch (Strategy) {
+ case SplitStrategy::Intersect:
+ SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ }
+ if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
@@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitINSERT(MI);
break;
case AArch64::ANDWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDSWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ Changed |= trySplitLogicalImm<uint32_t>(
+ AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
break;
case AArch64::ANDSXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ Changed |= trySplitLogicalImm<uint64_t>(
+ AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 40f49da..18ca22f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4905,14 +4905,17 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- // No need to unroll auto-vectorized loops
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
- return;
-
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining.
+ // inlining. Don't unroll auto-vectorized loops either, though do allow
+ // unrolling of the scalar remainder.
+ bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ // Both auto-vectorized loops and the scalar remainder have the
+ // isvectorized attribute, so differentiate between them by the presence
+ // of vector instructions.
+ if (IsVectorized && I.getType()->isVectorTy())
+ return;
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
index 0b79850..1a15075 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -50,8 +50,10 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub,
//
// %sub = G_SUB 0, %y
// %cmp = G_ICMP eq/ne, %z, %sub
+ // or with signed comparisons with the no-signed-wrap flag set
if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB ||
- !CmpInst::isEquality(Pred))
+ (!CmpInst::isEquality(Pred) &&
+ !(CmpInst::isSigned(Pred) && MaybeSub->getFlag(MachineInstr::NoSWrap))))
return false;
auto MaybeZero =
getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 1381a9b..d905692 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1810,7 +1810,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
// Couldn't optimize. Emit a compare + a Bcc.
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
- auto PredOp = ICmp.getOperand(1);
+ auto &PredOp = ICmp.getOperand(1);
emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
@@ -2506,12 +2506,12 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
return false;
}
auto &PredOp = Cmp->getOperand(1);
- auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
MIB.setInstrAndDebugLoc(I);
emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
/*RHS=*/Cmp->getOperand(3), PredOp, MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
+ const AArch64CC::CondCode InvCC =
+ changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
I.eraseFromParent();
return true;
@@ -3574,10 +3574,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return false;
}
- auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
+ auto &PredOp = I.getOperand(1);
+ emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
const AArch64CC::CondCode InvCC =
changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
- emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
/*Src2=*/AArch64::WZR, InvCC, MIB);
I.eraseFromParent();
@@ -5096,11 +5097,11 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
- auto Pred =
- static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
+ auto &PredOp = CondDef->getOperand(1);
+ emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp,
+ MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
CondCode = changeICMPPredToAArch64CC(Pred);
- emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
- CondDef->getOperand(1), MIB);
} else {
// Get the condition code for the select.
auto Pred =
@@ -5148,29 +5149,37 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
+
// Given this:
//
// x = G_SUB 0, y
- // G_ICMP x, z
+ // G_ICMP z, x
//
// Produce this:
//
- // cmn y, z
- if (isCMN(LHSDef, P, MRI))
- return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
+ // cmn z, y
+ if (isCMN(RHSDef, P, MRI))
+ return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
- // Same idea here, but with the RHS of the compare instead:
+ // Same idea here, but with the LHS of the compare instead:
//
// Given this:
//
// x = G_SUB 0, y
- // G_ICMP z, x
+ // G_ICMP x, z
//
// Produce this:
//
- // cmn z, y
- if (isCMN(RHSDef, P, MRI))
- return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
+ // cmn y, z
+ //
+ // But be careful! We need to swap the predicate!
+ if (isCMN(LHSDef, P, MRI)) {
+ if (!CmpInst::isEquality(P)) {
+ P = CmpInst::getSwappedPredicate(P);
+ Predicate = MachineOperand::CreatePredicate(P);
+ }
+ return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
+ }
// Given this:
//
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index bb0f667b..e0e1af7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
};
+ auto LowerTriOp = [&MI, &MIB](unsigned Opcode) {
+ MIB.buildInstr(Opcode, {MI.getOperand(0)},
+ {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)});
+ MI.eraseFromParent();
+ return true;
+ };
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
@@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_USUBSAT);
break;
}
+ case Intrinsic::aarch64_neon_udot:
+ return LowerTriOp(AArch64::G_UDOT);
+ case Intrinsic::aarch64_neon_sdot:
+ return LowerTriOp(AArch64::G_SDOT);
case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 1cd9453..8c10673 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -228,12 +228,13 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}
-// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
-// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
+// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y))
+// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y))
+// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
-bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
- const AArch64Subtarget &STI,
- std::tuple<Register, Register, bool> &MatchInfo) {
+bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const AArch64Subtarget &STI,
+ std::tuple<Register, Register, bool> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
"Expected a G_VECREDUCE_ADD instruction");
assert(STI.hasDotProd() && "Target should have Dot Product feature");
@@ -246,31 +247,57 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
return false;
- LLT SrcTy;
- auto I1Opc = I1->getOpcode();
- if (I1Opc == TargetOpcode::G_MUL) {
+ // Detect mul(ext, ext) with symmetric ext's. If I1Opc is G_ZEXT or G_SEXT
+ // then the ext's must match the same opcode. It is set to the ext opcode on
+ // output.
+ auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1,
+ Register &Out2, unsigned &I1Opc) {
// If result of this has more than 1 use, then there is no point in creating
- // udot instruction
- if (!MRI.hasOneNonDBGUse(MidReg))
+ // a dot instruction
+ if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
MachineInstr *ExtMI1 =
- getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
+ getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
MachineInstr *ExtMI2 =
- getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
+ getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI);
LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());
if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
return false;
+ if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) &&
+ I1Opc != ExtMI1->getOpcode())
+ return false;
+ Out1 = ExtMI1->getOperand(1).getReg();
+ Out2 = ExtMI2->getOperand(1).getReg();
I1Opc = ExtMI1->getOpcode();
- SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
- std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
- std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
+ return true;
+ };
+
+ LLT SrcTy;
+ unsigned I1Opc = I1->getOpcode();
+ if (I1Opc == TargetOpcode::G_MUL) {
+ Register Out1, Out2;
+ if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc))
+ return false;
+ SrcTy = MRI.getType(Out1);
+ std::get<0>(MatchInfo) = Out1;
+ std::get<1>(MatchInfo) = Out2;
} else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) {
- SrcTy = MRI.getType(I1->getOperand(1).getReg());
- std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
- std::get<1>(MatchInfo) = 0;
+ Register I1Op = I1->getOperand(1).getReg();
+ MachineInstr *M = getDefIgnoringCopies(I1Op, MRI);
+ Register Out1, Out2;
+ if (M->getOpcode() == TargetOpcode::G_MUL &&
+ tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) {
+ SrcTy = MRI.getType(Out1);
+ std::get<0>(MatchInfo) = Out1;
+ std::get<1>(MatchInfo) = Out2;
+ } else {
+ SrcTy = MRI.getType(I1Op);
+ std::get<0>(MatchInfo) = I1Op;
+ std::get<1>(MatchInfo) = 0;
+ }
} else {
return false;
}
@@ -288,11 +315,11 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
return true;
}
-void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &Builder,
- GISelChangeObserver &Observer,
- const AArch64Subtarget &STI,
- std::tuple<Register, Register, bool> &MatchInfo) {
+void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder,
+ GISelChangeObserver &Observer,
+ const AArch64Subtarget &STI,
+ std::tuple<Register, Register, bool> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
"Expected a G_VECREDUCE_ADD instruction");
assert(STI.hasDotProd() && "Target should have Dot Product feature");
@@ -553,15 +580,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
-// Pushes ADD/SUB through extend instructions to decrease the number of extend
-// instruction at the end by allowing selection of {s|u}addl sooner
-
+// Pushes ADD/SUB/MUL through extend instructions to decrease the number of
+// extend instruction at the end by allowing selection of {s|u}addl sooner
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
Register DstReg, Register SrcReg1, Register SrcReg2) {
assert((MI.getOpcode() == TargetOpcode::G_ADD ||
- MI.getOpcode() == TargetOpcode::G_SUB) &&
- "Expected a G_ADD or G_SUB instruction\n");
+ MI.getOpcode() == TargetOpcode::G_SUB ||
+ MI.getOpcode() == TargetOpcode::G_MUL) &&
+ "Expected a G_ADD, G_SUB or G_MUL instruction\n");
// Deal with vector types only
LLT DstTy = MRI.getType(DstReg);
@@ -594,9 +621,10 @@ void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
// G_SUB has to sign-extend the result.
- // G_ADD needs to sext from sext and can sext or zext from zext, so the
- // original opcode is used.
- if (MI.getOpcode() == TargetOpcode::G_ADD)
+ // G_ADD needs to sext from sext and can sext or zext from zext, and G_MUL
+ // needs to use the original opcode so the original opcode is used for both.
+ if (MI.getOpcode() == TargetOpcode::G_ADD ||
+ MI.getOpcode() == TargetOpcode::G_MUL)
B.buildInstr(Opc, {DstReg}, {AddReg});
else
B.buildSExt(DstReg, AddReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8b8fc8b..8a0c4ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
"VMEM CU scope prefetches do not fail on illegal address"
>;
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1383,6 +1389,9 @@ def FeatureAddSubU64Insts
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
"Has v_add_u64 and v_sub_u64 instructions">;
+def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
+ "true", "Has v_mad_u32 instruction">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1988,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
@@ -2042,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
+ FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2422,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts :
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
@@ -2565,6 +2576,10 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+def HasAddMinMaxInsts :
+ Predicate<"Subtarget->hasAddMinMaxInsts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
@@ -2832,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
+ AssemblerPredicate<(all_of FeatureMadU32Inst)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc37..6681393 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b44..59cc1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,7 +13,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5f19837..a9278c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
-static bool hasUnsafeFPMath(const Function &F) {
- return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
-}
-
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -104,7 +100,6 @@ public:
const DominatorTree *DT;
const UniformityInfo &UA;
const DataLayout &DL;
- const bool HasUnsafeFPMath;
const bool HasFP32DenormalFlush;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
@@ -117,7 +112,6 @@ public:
const DominatorTree *DT, const UniformityInfo &UA)
: F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
DT(DT), UA(UA), DL(F.getDataLayout()),
- HasUnsafeFPMath(hasUnsafeFPMath(F)),
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
DenormalMode::getPreserveSign()) {}
@@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
return false;
// v_rsq_f32 gives 1ulp
- return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
- SqrtOp->getFPAccuracy() >= 1.0f;
+ return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
IRBuilder<>::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(DivFMF | SqrtFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
canIgnoreDenormalInput(Den, CtxI)) {
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
// -1.0 / sqrt(x) -> fneg(rsq(x))
@@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
-// allowed with unsafe-fp-math or afn.
+// allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
Value *
@@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
//
// With rcp:
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
-// allowed with unsafe-fp-math or afn.
+// allowed with afn.
//
-// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
//
// With fdiv.fast:
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
@@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
RsqOp = SqrtOp->getOperand(0);
}
- // Inaccurate rcp is allowed with unsafe-fp-math or afn.
+ // Inaccurate rcp is allowed with afn.
//
// Defer to codegen to handle this.
//
@@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
// expansion of afn to codegen. The current interpretation is so aggressive we
// don't need any pre-consideration here when we have better information. A
// more conservative interpretation could use handling here.
- const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
+ const bool AllowInaccurateRcp = DivFMF.approxFunc();
if (!RsqOp && AllowInaccurateRcp)
return false;
@@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
// We're trying to handle the fast-but-not-that-fast case only. The lowering
// of fast llvm.sqrt will give the raw instruction anyway.
- if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
+ if (SqrtFMF.approxFunc())
return false;
const float ReqdAccuracy = FPOp->getFPAccuracy();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index c01e5d3..992572f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
+def gi_global_saddr_no_ioffset :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 2991778..19b8757 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -204,7 +204,7 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
for (auto &Op : Node->operands())
Dims.push_back(Dims.getDocument()->getNode(
- uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue())));
+ mdconst::extract<ConstantInt>(Op)->getZExtValue()));
return Dims;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index dfaa145..39b4200 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
unsigned Opc;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
if (Subtarget->hasMADIntraFwdBug())
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
else
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
Clamp };
+
+ if (UseNoCarry) {
+ MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
+ ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
@@ -2049,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5636d89..983f1aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -174,6 +174,8 @@ private:
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f25ce87..31c4f62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2634,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
if (Flags.hasApproximateFuncs())
return true;
auto &Options = DAG.getTarget().Options;
- return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+ return Options.ApproxFuncFPMath;
}
bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
@@ -2757,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
const auto &Options = getTargetMachine().Options;
if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
- Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
+ Options.ApproxFuncFPMath) {
if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
// Log and multiply in f32 is good enough for f16.
@@ -3585,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
if (N0.getValueType() == MVT::f32)
return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
- if (getTargetMachine().Options.UnsafeFPMath) {
+ if (Op->getFlags().hasApproximateFuncs()) {
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
return SDValue();
}
@@ -4846,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
-// Detect when CMP and SELECT use the same constant and fold them to avoid
-// loading the constant twice. Specifically handles patterns like:
-// %cmp = icmp eq i32 %val, 4242
-// %sel = select i1 %cmp, i32 4242, i32 %other
-// It can be optimized to reuse %val instead of 4242 in select.
-static SDValue
-foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AMDGPUSubtarget *ST) {
- SDValue Cond = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue FalseVal = N->getOperand(2);
-
- // Check if condition is a comparison.
- if (Cond.getOpcode() != ISD::SETCC)
- return SDValue();
-
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
- bool isInteger = LHS.getValueType().isInteger();
-
- // Handle simple floating-point and integer types only.
- if (!isFloatingPoint && !isInteger)
- return SDValue();
-
- bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
- bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
- if (!isEquality && !isNonEquality)
- return SDValue();
-
- SDValue ArgVal, ConstVal;
- if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
- (isInteger && isa<ConstantSDNode>(RHS))) {
- ConstVal = RHS;
- ArgVal = LHS;
- } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
- (isInteger && isa<ConstantSDNode>(LHS))) {
- ConstVal = LHS;
- ArgVal = RHS;
- } else {
- return SDValue();
- }
-
- // Check if constant should not be optimized - early return if not.
- if (isFloatingPoint) {
- const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
- const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
-
- // Only optimize normal floating-point values (finite, non-zero, and
- // non-subnormal as per IEEE 754), skip optimization for inlinable
- // floating-point constants.
- if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
- return SDValue();
- } else {
- int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
-
- // Skip optimization for inlinable integer immediates.
- // Inlinable immediates include: -16 to 64 (inclusive).
- if (IntVal >= -16 && IntVal <= 64)
- return SDValue();
- }
-
- // For equality and non-equality comparisons, patterns:
- // select (setcc x, const), const, y -> select (setcc x, const), x, y
- // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
- if (!(isEquality && TrueVal == ConstVal) &&
- !(isNonEquality && FalseVal == ConstVal))
- return SDValue();
-
- SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
- SDValue SelectRHS =
- (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
- return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
- SelectLHS, SelectRHS);
-}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
- // Try to fold CMP + SELECT patterns with shared constants (both FP and
- // integer).
- if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
- return Folded;
-
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 266dee1..b0d3b12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
+ MRI->use_nodbg_empty(I.getOperand(1).getReg());
unsigned Opc;
if (Subtarget->hasMADIntraFwdBug())
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
: AMDGPU::V_MAD_I64_I32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
+ : AMDGPU::V_MAD_NC_I64_I32_e64;
else
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+
+ if (UseNoCarry)
+ I.removeOperand(1);
+
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
@@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
}
unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ if (!IsB32 && STI.hasTrue16BitInsts())
+ Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
+ : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
unsigned CBL = STI.getConstantBusLimit(Opc);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -5789,6 +5801,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
@@ -6971,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
: (int64_t)SISrcMods::DST_OP_SEL);
}
@@ -6986,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
? (int64_t)(SISrcMods::OP_SEL_0)
: 0);
}
@@ -7021,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ ? (int64_t)SISrcMods::DST_OP_SEL
+ : 0);
}
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index fe9743d0a..140e753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -264,6 +264,8 @@ private:
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7a50923..511fc69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode()
def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
-def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
def FMA : Predicate<"Subtarget->hasFMA()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f..1fdf272 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
- .widenScalarToNextPow2(0)
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder(G_ABS)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ if (ST.hasIntMinMax64()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, S64, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ }
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
@@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
- if (ST.hasGFX90AInsts()) {
+ if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
@@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
LLT::scalar(32), commonAlignment(Align(64), Offset));
// Pointer address
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
- B.buildPtrAdd(LoadAddr, QueuePtr,
- B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
+ B.buildObjectPtrOffset(
+ LoadAddr, QueuePtr,
+ B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -3326,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
if (Flags & MachineInstr::FmAfn)
return true;
const auto &Options = MF.getTarget().Options;
- return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+ return Options.ApproxFuncFPMath;
}
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3432,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
- TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
+ TM.Options.ApproxFuncFPMath) {
if (Ty == F16 && !ST.has16BitInsts()) {
Register LogVal = MRI.createGenericVirtualRegister(F32);
auto PromoteSrc = B.buildFPExt(F32, X);
@@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
llvm_unreachable("failed to find kernarg segment ptr");
auto COffset = B.buildConstant(LLT::scalar(64), Offset);
- // TODO: Should get nuw
- return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
+ return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
}
/// Legalize a value that's loaded from kernel arguments. This is only used by
@@ -4860,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
- const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
- MF.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
@@ -4922,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
- const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
- MI.getFlag(MachineInstr::FmAfn);
+ bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
if (!AllowInaccurateRcp)
return false;
@@ -5676,8 +5689,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
return false;
- // FIXME: This should be nuw
- B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ B.buildObjectPtrOffset(DstReg, KernargPtrReg,
+ B.buildConstant(IdxTy, Offset).getReg(0));
return true;
}
@@ -7019,8 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
// Pointer address
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
B.buildCopy(SGPR01, Temp);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 8767208..aa75534 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -53,8 +53,6 @@ private:
using FuncInfo = llvm::AMDGPULibFunc;
- bool UnsafeFPMath = false;
-
// -fuse-native.
bool AllNative = false;
@@ -117,7 +115,6 @@ private:
bool AllowStrictFP = false);
protected:
- bool isUnsafeMath(const FPMathOperator *FPOp) const;
bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
@@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
return AMDGPULibFunc::parse(FMangledName, FInfo);
}
-bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
- return UnsafeFPMath || FPOp->isFast();
-}
-
bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
- return UnsafeFPMath ||
- (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
+ return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
}
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
const FPMathOperator *FPOp) const {
// TODO: Refine to approxFunc or contract
- return isUnsafeMath(FPOp);
+ return FPOp->isFast();
}
void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
- UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
AC = &FAM.getResult<AssumptionAnalysis>(F);
TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index f471881..b45627d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
BasePlusOffset = Base;
} else {
auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
- BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
+ BasePlusOffset =
+ B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
}
auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c5a1d9e..6bca2fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX:
case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
@@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
+ if (isSALUMapping(MI)) {
+ // There are no scalar 64-bit min and max, use vector instruction instead.
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&
+ Subtarget.hasIntMinMax64())
+ return getDefaultMappingVOP(MI);
+ return getDefaultMappingSOP(MI);
+ }
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
@@ -4566,8 +4574,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16:
+ case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_cvt_pk_fp8_f16:
+ case Intrinsic::amdgcn_cvt_pk_bf8_f16:
+ case Intrinsic::amdgcn_cvt_sr_fp8_f16:
+ case Intrinsic::amdgcn_cvt_sr_bf8_f16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
@@ -5364,6 +5377,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c865082..c1f1703 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -104,7 +104,9 @@
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
@@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}
@@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// selection after linking to prevent, otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
@@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
// TODO: May want to move later or split into an early and late one.
addPass(AMDGPUCodeGenPreparePass(TM));
- // TODO: LICM
+ // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
+ // have expanded.
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) {
+ addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
+ /*UseMemorySSA=*/true));
+ }
}
Base::addIRPasses(addPass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 24f4df2..a0c99b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
// Estimate all types may be fused with contract/unsafe flags
const TargetOptions &Options = TLI->getTargetMachine().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath ||
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
return TargetTransformInfo::TCC_Free;
}
@@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
return LT.first * Cost * NElts;
}
- if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
- TLI->getTargetMachine().Options.UnsafeFPMath)) {
+ if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
// Fast unsafe fdiv lowering:
// f32 rcp
// f32 fmul
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc42..a4ea8cf 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -689,6 +689,8 @@ public:
bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }
+ bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); }
+
bool isVISrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
}
@@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
case AMDGPU::OPERAND_REG_IMM_BF16:
@@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
@@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
setImmKindConst();
return;
}
+ [[fallthrough]];
+
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
setImmKindLiteral();
@@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16)
return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm());
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16)
+ return false;
+
llvm_unreachable("invalid operand type");
}
default:
@@ -6066,6 +6076,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
@@ -9415,7 +9431,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) {
Inst.addOperand(Inst.getOperand(0));
}
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f99e716..1956a15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
}
//===----------------------------------------------------------------------===//
-// MUBUF - GFX11, GFX12.
+// MUBUF - GFX11, GFX12, GFX1250.
//===----------------------------------------------------------------------===//
// gfx11 instruction that accept both old and new assembler name.
@@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op,
def : Mnem_gfx12<gfx11_name, gfx12_name>;
}
+multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>,
+ MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> {
+ def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>;
+}
+
defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>;
defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>;
@@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
+defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>;
+defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">;
+defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">;
+
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 42edec0..c466f9c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen
Instrumentation
MC
MIRParser
+ ObjCARC
Passes
Scalar
SelectionDAG
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d..3ff675d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+defm DS_ADD_F64 : DS_Real_gfx12<0x054>;
+defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>;
+
let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7207c25..d5d1074 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,6 +11,7 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+ def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -369,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
}
}
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+// Async loads, introduced in gfx1250, will store directly
+// to a DS address in vdst (they will not use M0 for DS addess).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = 1;
+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let LGKM_CNT = !not(IsAsync);
+ let VM_CNT = !not(IsAsync);
+ let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
let lds = 1;
let has_data = 0;
+ let has_vdst = IsAsync; // vdst for ds address with IsAsync
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
+ let Defs = !if(IsAsync, [ASYNCcnt], []);
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ GlobalSaddrTable<1, opName>;
+}
+
+class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !con(
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let VM_CNT = 0;
+ let ASYNC_CNT = 1;
+ let is_flat_global = 1;
+ let lds = 1;
+ let has_data = 1; // vdata for ds address
let has_vdst = 0;
let mayLoad = 1;
let mayStore = 1;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let VALU = 1;
- let Uses = [M0, EXEC];
+ let Uses = [EXEC, ASYNCcnt];
+ let Defs = [ASYNCcnt];
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
+ def "" : FLAT_Global_STORE_LDS_Pseudo<opName>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -1156,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = isGFX1250Plus in {
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
+
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
} // End SubtargetPredicate = isGFX1250Plus
@@ -1315,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $dsaddr, $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
+class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $vaddr, $dsaddr, $offset, $cpol)
+>;
+
+class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $saddr, $voffset, $dsaddr, $offset, $cpol)
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
@@ -1525,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatLoadLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatStoreLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -2091,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
} // End SubtargetPredicate = isGFX125xOnly
+let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
+
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -3374,12 +3473,29 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
+defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
+
+defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">;
+defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">;
+
def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 785ede3..bdd900d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -248,6 +248,7 @@ protected:
bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
bool HasSafeCUPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -272,6 +273,7 @@ protected:
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
+ bool HasMadU32Inst = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -714,7 +716,9 @@ public:
bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
// DS_ADD_F64/DS_ADD_RTN_F64
- bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
+ bool hasLdsAtomicAddF64() const {
+ return hasGFX90AInsts() || hasGFX1250Insts();
+ }
bool hasMultiDwordFlatScratchAddressing() const {
return getGeneration() >= GFX9;
@@ -998,6 +1002,8 @@ public:
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1516,9 +1522,22 @@ public:
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+ // \returns true if the target has V_MAD_U32 instruction.
+ bool hasMadU32Inst() const { return HasMadU32Inst; }
+
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
bool hasVectorMulU64() const { return GFX1250Insts; }
+ // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
+ // instructions.
+ bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+ bool hasIntMinMax64() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
+ bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 11b072e..15088ac 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O))
return;
break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
default:
llvm_unreachable("bad operand type");
}
@@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index c49ad79..f358084 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm))
.value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return 255;
+
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
case AMDGPU::OPERAND_KIMM64:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d33..43ca548 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 40b8bcd..c564145 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -208,6 +208,7 @@ enum OperandType : unsigned {
OPERAND_REG_IMM_V2BF16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_NOINLINE_V2FP16,
OPERAND_REG_IMM_V2INT32,
OPERAND_REG_IMM_V2FP32,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index b77da4d..e934152 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8d51ec6..ad26757 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
+ if (Subtarget->hasIntMinMax64())
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
+ Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1256,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+static unsigned getIntrMemWidth(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ return 8;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ return 32;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ return 64;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ return 128;
+ default:
+ llvm_unreachable("Unknown width");
+ }
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1527,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
@@ -1623,10 +1666,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -4241,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
if (Alignment > StackAlign) {
- uint64_t ScaledAlignment = (uint64_t)Alignment.value()
+ uint64_t ScaledAlignment = Alignment.value()
<< Subtarget->getWavefrontSizeLog2();
uint64_t StackAlignMask = ScaledAlignment - 1;
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
@@ -7148,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
- if (getTargetMachine().Options.UnsafeFPMath) {
+ if (Op->getFlags().hasApproximateFuncs()) {
SDValue Flags = Op.getOperand(1);
SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
@@ -11243,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateRcp =
- Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
// Without !fpmath accuracy information, we can't do more because we don't
@@ -11263,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// 1.0 / sqrt(x) -> rsq(x)
- // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+ // XXX - Is afn sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
// 1.0 / x -> rcp(x)
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
@@ -11297,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateDiv =
- Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
if (!AllowInaccurateDiv)
return SDValue();
@@ -14550,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
- if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N0->getFlags().hasAllowContract() &&
N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
@@ -15673,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
// regardless of the denorm mode setting. Therefore,
- // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
+ // fp-contract is sufficient to allow generating fdot2.
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N->getFlags().hasAllowContract() &&
FMA->getFlags().hasAllowContract())) {
Op1 = Op1.getOperand(0);
@@ -15896,6 +15945,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue SITargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ // Try to fold CMP + SELECT patterns with shared constants (both FP and
+ // integer).
+ // Detect when CMP and SELECT use the same constant and fold them to avoid
+ // loading the constant twice. Specifically handles patterns like:
+ // %cmp = icmp eq i32 %val, 4242
+ // %sel = select i1 %cmp, i32 4242, i32 %other
+ // It can be optimized to reuse %val instead of 4242 in select.
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Skip optimization for inlinable immediates.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ if (AMDGPU::isInlinableIntLiteral(
+ cast<ConstantSDNode>(ConstVal)->getSExtValue()))
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -15944,6 +16065,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMulCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
+ case ISD::SELECT:
+ if (auto Res = performSelectCombine(N, DCI))
+ return Res;
+ break;
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index acf6158..dedd9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -211,6 +211,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 520c321..4b48fc4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the callee eventually calls createNewWaitcnt on it. We test this
+ // possibility in an articial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+ // Architectures higher than GFX10 do not have direct loads to
+ // LDS, so no work required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_lds_direct ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2aa6b4e..044a681 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return AMDGPU::isInlinableLiteralV2BF16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return false;
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
@@ -9281,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
default:
if (MI.isMetaInstruction())
return 0;
+
+ // If D16 Pseudo inst, get correct MC code size
+ const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
+ if (D16Info) {
+ // Assume d16_lo/hi inst are always in same size
+ unsigned LoInstOpcode = D16Info->LoOp;
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
+ DescSize = Desc.getSize();
+ }
+
return DescSize;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 83b0490..efcc88e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2859,6 +2859,7 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
+def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
@@ -2926,6 +2927,7 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
+def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 607825e..53f554e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,8 +321,7 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
- bool Atomic) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -603,8 +602,7 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool finalizeStore(MachineBasicBlock::iterator &MI,
- bool Atomic) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -1172,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
@@ -2080,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2538,9 +2556,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2553,9 +2568,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
- bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
if (!CPol)
return false;
@@ -2570,7 +2584,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
- if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
+ // We also require SCOPE_SE minimum if we not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
return false;
@@ -2674,6 +2690,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2690,7 +2708,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
- Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2703,7 +2721,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 218841d..36d1a3b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1218,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> {
def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">;
def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">;
+def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">;
+
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc..8303410 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b5b3cc9..5827f18 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) {
}
bool isAsyncStore(unsigned Opc) {
- return false; // placeholder before async store implementation.
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250;
}
bool isTensorStore(unsigned Opc) {
@@ -2652,6 +2659,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
@@ -3016,6 +3024,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return isInlinableLiteralV2BF16(Literal);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return false;
default:
llvm_unreachable("bad packed operand type");
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c09a9d6..74d59f4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1636,6 +1636,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
return 2;
default:
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 550ec9d..9de7d6d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">;
} // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit]
let SubtargetPredicate = HasPkFmacF16Inst in {
+// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection.
+// If this changes, ensure the DPP variant is not used for GFX11+.
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
} // End SubtargetPredicate = HasPkFmacF16Inst
@@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
- VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;
+ VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>;
multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index b6f9568..96fe503 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let HasExtDPP = 0;
}
-let HasExt64BitDPP = 1 in {
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
@@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
+let HasExt64BitDPP = 1 in {
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+
class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
@@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>;
def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>;
-
-def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> {
let HasExtVOP3DPP = 0;
- let HasExtDPP = 0;
+ let HasExt64BitDPP = 1;
+}
+def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>;
+def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> {
+ let HasClamp = 1;
}
} // End HasExt64BitDPP = 1;
@@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SchedRW = [WriteIntMul] in {
+ let SubtargetPredicate = HasMadU32Inst in
+ defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>;
+ let SubtargetPredicate = isGFX1250Plus in {
+ defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>;
+ defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>;
+ }
+}
+
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
@@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
+defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>;
+defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>;
+defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>;
+defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>;
+} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd]
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -722,6 +746,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>;
+}
+
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
@@ -848,6 +879,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in
+ def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>;
+
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
@@ -858,6 +892,13 @@ def : GCNPat<
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;
+let SubtargetPredicate = HasAddMinMaxInsts in {
+def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+}
+
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
@@ -972,10 +1013,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
unsigned Val = N->getZExtValue();
unsigned New = 0;
if (}] # modifier_idx # [{ == 0) {
- New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
- : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
- } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
- New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
+ : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
+ } else if (}] # modifier_idx # [{== 1) {
+ New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ } if (}] # modifier_idx # [{== 2) {
+ New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
}
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
@@ -1427,34 +1470,72 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
-let SubtargetPredicate = HasBitOp3Insts in {
+let HasClamp = 0, HasModifiers = 1 in {
+def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
+def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
+def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>;
+}
+
+let OtherPredicates = [HasBitOp3Insts] in {
let isReMaterializable = 1 in {
- defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16",
- VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>;
+ let SubtargetPredicate = isGFX940Plus in
+ defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>;
+ let SubtargetPredicate = isGFX1250Plus in
+ defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile,
+ BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>;
defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32",
VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>,
VOPD_Component<0x12, "v_bitop2_b32">;
}
+
def : GCNPat<
(i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
def : GCNPat<
- (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-
- def : GCNPat<
(i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
- def : GCNPat<
- (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-} // End SubtargetPredicate = HasBitOp3Insts
+ let SubtargetPredicate = isGFX940Plus in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ } // End SubtargetPredicate = isGFX940Plus
+
+ let SubtargetPredicate = isGFX1250Plus in {
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ } // End SubtargetPredicate = isGFX1250Plus
+
+} // End OtherPredicates = [HasBitOp3Insts]
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
(AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
@@ -1531,6 +1612,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true;
let SubtargetPredicate = HasBF16ConversionInsts in {
let ReadsModeReg = 0 in {
defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>;
+ defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>;
}
def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
(V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
@@ -1541,6 +1623,53 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>;
}
+let Src0RC64 = VSrc_NoInline_v2f16 in {
+def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
+def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
+def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>;
+}
+
+let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in {
+ defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250",
+ VOP3_CVT_PK_F8_F16_Profile,
+ VOP3_CVT_PK_F8_F16_True16_Profile,
+ VOP3_CVT_PK_F8_F16_Fake16_Profile,
+ int_amdgcn_cvt_pk_fp8_f16>;
+ defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250",
+ VOP3_CVT_PK_F8_F16_Profile,
+ VOP3_CVT_PK_F8_F16_True16_Profile,
+ VOP3_CVT_PK_F8_F16_Fake16_Profile,
+ int_amdgcn_cvt_pk_bf8_f16>;
+}
+
+let HasClamp = 0, HasOpSel = 1 in {
+def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>;
+def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>;
+def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>;
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+ let ReadsModeReg = 0 in {
+ // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel
+ // to select a byte in the vdst. Bits 0 and 1 are unused.
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile,
+ VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
+ defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile,
+ VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
+ }
+ } // End ReadsModeReg = 0
+
+ let True16Predicate = UseRealTrue16Insts in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
+ }
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>;
+ }
+} // End SubtargetPredicate = isGFX1250Plus
+
class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
@@ -1746,6 +1875,21 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
+defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">;
+defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>;
+
+defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
+defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
+defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
+defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>;
+defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>;
+defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>;
+defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>;
+defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>;
+defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
+defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
+defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
+
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
@@ -1918,7 +2062,15 @@ let AssemblerPredicate = isGFX11Plus in {
// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
+defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
+defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
+defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
+defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
+defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">;
+defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">;
+defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>;
+defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d3..a029376 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0);
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
@@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
let Inst{14 - 8} = sdst;
}
+class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName>
+ : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
+class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName>
+ : Base_VOP3_DPP8_t16<op, p, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: Base_VOP3_DPP8<op, ps, opName> {
bits<8> sdst;
@@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple<
multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
VOP3be_Realtriple<Gen, op, 1>;
+multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> {
+ def _e64_dpp#Gen.Suffix :
+ VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>;
+}
+
+multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> {
+ let DecoderNamespace =
+ Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16");
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ }
+}
+
+multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3a_BITOP3_gfx12<op, ps.Pfl>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// VOP3 GFX11
//===----------------------------------------------------------------------===//
@@ -2004,6 +2071,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> :
multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> :
VOP3_Realtriple<GFX1250Gen, op, isSingle>;
+multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName,
+ string asmName, string pseudo_mnemonic = "",
+ bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>;
+
multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
@@ -2024,6 +2100,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName,
defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>;
}
+multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op,
+ string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ string opName = NAME, string pseudo_mnemonic = ""> {
+ defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>;
+ defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>;
+}
+
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
string asmName, bit isSingle = 0> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -2046,6 +2129,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+ VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>;
+
+multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
+ defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>;
+ defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>;
+}
+
multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
string opName = NAME> :
VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 50217c3..9e4dbec 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl(
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI.getParent()->getParent();
- // FIXME: Use Function::hasOptSize().
- if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize))
+ if (MF->getFunction().hasOptSize())
--Latency;
}
return Latency;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 066b392..bd4b75f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2423,6 +2423,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool doesNotRet = CLI.DoesNotReturn;
bool isVarArg = CLI.IsVarArg;
+ const CallBase *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2446,6 +2447,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
!Subtarget->noBTIAtReturnTwice())
GuardWithBTI = AFI->branchTargetEnforcement();
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
+
// Determine whether this is a non-secure function call.
if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
isCmseNSCall = true;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 868556b..6dfe846 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
// Add the R_ARM_NONE fixup at the same position
void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
+ visitUsedSymbol(*PersonalitySym);
const MCSymbolRefExpr *PersonalityRef =
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
-
- visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getCurrentFragment();
- DF->addFixup(
- MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
+ addFixup(PersonalityRef, FK_Data_4);
}
void ARMELFStreamer::FlushPendingOffset() {
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index a87b9a2..bed6bc9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
return;
}
- // MapDef type may be a struct type or a non-pointer derived type
- const DIType *OrigTy = Ty;
- while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
- auto Tag = DTy->getTag();
- if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
- Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type)
- break;
- Ty = DTy->getBaseType();
- }
-
- const auto *CTy = dyn_cast<DICompositeType>(Ty);
- if (!CTy)
- return;
-
- auto Tag = CTy->getTag();
- if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
- return;
-
- // Visit all struct members to ensure their types are visited.
- const DINodeArray Elements = CTy->getElements();
- for (const auto *Element : Elements) {
- const auto *MemberType = cast<DIDerivedType>(Element);
- const DIType *MemberBaseType = MemberType->getBaseType();
-
- // If the member is a composite type, that may indicate the currently
- // visited composite type is a wrapper, and the member represents the
- // actual map definition.
- // In that case, visit the member with `visitMapDefType` instead of
- // `visitTypeEntry`, treating it specifically as a map definition rather
- // than as a regular composite type.
- const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
- if (MemberCTy) {
- visitMapDefType(MemberBaseType, TypeId);
- } else {
- visitTypeEntry(MemberBaseType);
+ uint32_t TmpId;
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_pointer_type:
+ visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_array_type:
+ // Visit nested map array and jump to the element type
+ visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_structure_type: {
+ // Visit all struct members to ensure their types are visited.
+ const auto *CTy = cast<DICompositeType>(Ty);
+ const DINodeArray Elements = CTy->getElements();
+ for (const auto *Element : Elements) {
+ const auto *MemberType = cast<DIDerivedType>(Element);
+ const DIType *MemberBaseType = MemberType->getBaseType();
+ // If the member is a composite type, that may indicate the currently
+ // visited composite type is a wrapper, and the member represents the
+ // actual map definition.
+ // In that case, visit the member with `visitMapDefType` instead of
+ // `visitTypeEntry`, treating it specifically as a map definition rather
+ // than as a regular composite type.
+ const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
+ if (MemberCTy) {
+ visitMapDefType(MemberBaseType, TmpId);
+ } else {
+ visitTypeEntry(MemberBaseType);
+ }
}
+ break;
+ }
+ default:
+ break;
}
// Visit this type, struct or a const/typedef/volatile/restrict type
- visitTypeEntry(OrigTy, TypeId, false, false);
+ visitTypeEntry(Ty, TypeId, false, false);
}
/// Read file contents from the actual file or from the source
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index c86fa2b..54c3cea 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
const Function &F = MF.getFunction();
- bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = F.hasOptSize();
// Combine aggressively (for code size)
ShouldCombineAggressively =
diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp
index 6eccf80..9d7776d 100644
--- a/llvm/lib/Target/Hexagon/HexagonMask.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp
@@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) {
HII = HST.getInstrInfo();
const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.hasOptSize())
return false;
// Mask instruction is available only from v66
if (!HST.hasV66Ops())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d96136c..a5bf0e5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2621,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- if (isa<ConstantSDNode>(Op->getOperand(2)))
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ if (isa<ConstantSDNode>(Op2))
return Op;
- return SDValue();
+
+ MVT IdxTy = MVT::getIntegerVT(EltSizeInBits);
+ MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts);
+
+ if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy))
+ return SDValue();
+
+ SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1);
+ SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2);
+
+ SmallVector<SDValue, 32> RawIndices;
+ for (unsigned i = 0; i < NumElts; ++i)
+ RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+ SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices);
+
+ // insert vec, elt, idx
+ // =>
+ // select (splatidx == {0,1,2...}) ? splatelt : vec
+ SDValue SelectCC =
+ DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ);
+ return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0);
}
SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index ec6b382..881ba8e 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -3341,6 +3341,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
+ const CallBase *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3397,8 +3398,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned StackSize = CCInfo.getStackSize();
- // Call site info for function parameters tracking.
+ // Call site info for function parameters tracking and call base type info.
MachineFunction::CallSiteInfo CSInfo;
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
// Check if it's really possible to do a tail call. Restrict it to functions
// that are part of this compilation unit.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 65e7c56..95abcde 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
CodeGenOptLevel OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm) {
- doMulWide = (OptLevel > CodeGenOptLevel::None);
-}
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
@@ -145,18 +143,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStoreVector(N))
return;
break;
- case NVPTXISD::LoadParam:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadParamV4:
- if (tryLoadParam(N))
- return;
- break;
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- if (tryStoreParam(N))
- return;
- break;
case ISD::INTRINSIC_W_CHAIN:
if (tryIntrinsicChain(N))
return;
@@ -1462,267 +1448,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return true;
}
-bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
- SDValue Chain = Node->getOperand(0);
- SDValue Offset = Node->getOperand(2);
- SDValue Glue = Node->getOperand(3);
- SDLoc DL(Node);
- MemSDNode *Mem = cast<MemSDNode>(Node);
-
- unsigned VecSize;
- switch (Node->getOpcode()) {
- default:
- return false;
- case NVPTXISD::LoadParam:
- VecSize = 1;
- break;
- case NVPTXISD::LoadParamV2:
- VecSize = 2;
- break;
- case NVPTXISD::LoadParamV4:
- VecSize = 4;
- break;
- }
-
- EVT EltVT = Node->getValueType(0);
- EVT MemVT = Mem->getMemoryVT();
-
- std::optional<unsigned> Opcode;
-
- switch (VecSize) {
- default:
- return false;
- case 1:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
- NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64);
- break;
- case 2:
- Opcode =
- pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
- NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
- NVPTX::LoadParamMemV2I64);
- break;
- case 4:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
- NVPTX::LoadParamMemV4I32, {/* no v4i64 */});
- break;
- }
- if (!Opcode)
- return false;
-
- SDVTList VTs;
- if (VecSize == 1) {
- VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
- } else if (VecSize == 2) {
- VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
- } else {
- EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
- VTs = CurDAG->getVTList(EVTs);
- }
-
- unsigned OffsetVal = Offset->getAsZExtVal();
-
- SmallVector<SDValue, 2> Ops(
- {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
- return true;
-}
-
-// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
-#define getOpcV2H(ty, opKind0, opKind1) \
- NVPTX::StoreParamV2##ty##_##opKind0##opKind1
-
-#define getOpcV2H1(ty, opKind0, isImm1) \
- (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
-
-#define getOpcodeForVectorStParamV2(ty, isimm) \
- (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
-
-#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
- NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
-
-#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
- (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
- : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
-
-#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
- (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
- : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
-
-#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
- (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
- : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
-
-#define getOpcodeForVectorStParamV4(ty, isimm) \
- (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
- : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
-
-#define getOpcodeForVectorStParam(n, ty, isimm) \
- (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
- : getOpcodeForVectorStParamV4(ty, isimm)
-
-static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
- unsigned NumElts,
- MVT::SimpleValueType MemTy,
- SelectionDAG *CurDAG, SDLoc DL) {
- // Determine which inputs are registers and immediates make new operators
- // with constant values
- SmallVector<bool, 4> IsImm(NumElts, false);
- for (unsigned i = 0; i < NumElts; i++) {
- IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
- if (IsImm[i]) {
- SDValue Imm = Ops[i];
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[i] = Imm;
- }
- }
-
- // Get opcode for MemTy, size, and register/immediate operand ordering
- switch (MemTy) {
- case MVT::i8:
- return getOpcodeForVectorStParam(NumElts, I8, IsImm);
- case MVT::i16:
- return getOpcodeForVectorStParam(NumElts, I16, IsImm);
- case MVT::i32:
- return getOpcodeForVectorStParam(NumElts, I32, IsImm);
- case MVT::i64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(I64, IsImm);
- case MVT::f32:
- return getOpcodeForVectorStParam(NumElts, F32, IsImm);
- case MVT::f64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(F64, IsImm);
-
- // These cases don't support immediates, just use the all register version
- // and generate moves.
- case MVT::i1:
- return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
- : NVPTX::StoreParamV4I8_rrrr;
- case MVT::f16:
- case MVT::bf16:
- return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
- : NVPTX::StoreParamV4I16_rrrr;
- case MVT::v2f16:
- case MVT::v2bf16:
- case MVT::v2i16:
- case MVT::v4i8:
- return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
- : NVPTX::StoreParamV4I32_rrrr;
- default:
- llvm_unreachable("Cannot select st.param for unknown MemTy");
- }
-}
-
-bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Param = N->getOperand(1);
- unsigned ParamVal = Param->getAsZExtVal();
- SDValue Offset = N->getOperand(2);
- unsigned OffsetVal = Offset->getAsZExtVal();
- MemSDNode *Mem = cast<MemSDNode>(N);
- SDValue Glue = N->getOperand(N->getNumOperands() - 1);
-
- // How many elements do we have?
- unsigned NumElts;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case NVPTXISD::StoreParam:
- NumElts = 1;
- break;
- case NVPTXISD::StoreParamV2:
- NumElts = 2;
- break;
- case NVPTXISD::StoreParamV4:
- NumElts = 4;
- break;
- }
-
- // Build vector of operands
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(N->getOperand(i + 3));
- Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
- CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- // Determine target opcode
- // If we have an i1, use an 8-bit store. The lowering code in
- // NVPTXISelLowering will have already emitted an upcast.
- std::optional<unsigned> Opcode;
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected NumElts");
- case 1: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- SDValue Imm = Ops[0];
- if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
- (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
- // Convert immediate to target constant
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[0] = Imm;
- // Use immediate version of store param
- Opcode =
- pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i,
- NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i);
- } else
- Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
- NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
- NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r);
- if (Opcode == NVPTX::StoreParamI8_r) {
- // Fine tune the opcode depending on the size of the operand.
- // This helps to avoid creating redundant COPY instructions in
- // InstrEmitter::AddRegisterOperand().
- switch (Ops[0].getSimpleValueType().SimpleTy) {
- default:
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI8TruncI32_r;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI8TruncI64_r;
- break;
- }
- }
- break;
- }
- case 2:
- case 4: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
- break;
- }
- }
-
- SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
-
- ReplaceNode(N, Ret);
- return true;
-}
-
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b99b4ef..9e0f88e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -40,9 +40,6 @@ private:
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
- // If true, generate mul.wide from sext and mul
- bool doMulWide;
-
NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
bool usePrecSqrtF32(const SDNode *N) const;
bool useF32FTZ() const;
@@ -78,8 +75,6 @@ private:
bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
- bool tryLoadParam(SDNode *N);
- bool tryStoreParam(SDNode *N);
bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryBFE(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ddcecc00..4fd3623 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
- ISD::STORE});
+ ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
MAKE_CASE(NVPTXISD::CALL)
- MAKE_CASE(NVPTXISD::LoadParam)
- MAKE_CASE(NVPTXISD::LoadParamV2)
- MAKE_CASE(NVPTXISD::LoadParamV4)
- MAKE_CASE(NVPTXISD::StoreParam)
- MAKE_CASE(NVPTXISD::StoreParamV2)
- MAKE_CASE(NVPTXISD::StoreParamV4)
MAKE_CASE(NVPTXISD::MoveParam)
MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
MAKE_CASE(NVPTXISD::BUILD_VECTOR)
@@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
-static bool adjustElementType(EVT &ElementType) {
- switch (ElementType.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::f16:
- case MVT::bf16:
- ElementType = MVT::i16;
- return true;
- case MVT::f32:
- case MVT::v2f16:
- case MVT::v2bf16:
- ElementType = MVT::i32;
- return true;
- case MVT::f64:
- ElementType = MVT::i64;
- return true;
- }
-}
-
-// Use byte-store when the param address of the argument value is unaligned.
-// This may happen when the return value is a field of a packed structure.
-//
-// This is called in LowerCall() when passing the param values.
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
- uint64_t Offset, EVT ElementType,
- SDValue StVal, SDValue &InGlue,
- unsigned ArgID, const SDLoc &dl) {
- // Bit logic only works on integer types
- if (adjustElementType(ElementType))
- StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
-
- // Store each byte
- SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- // Shift the byte to the last byte position
- SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
- DAG.getConstant(i * 8, dl, MVT::i32));
- SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- ShiftVal, InGlue};
- // Trunc store only the last byte by using
- // st.param.b8
- // The register type can be larger than b8.
- Chain = DAG.getMemIntrinsicNode(
- NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
- MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
- }
- return Chain;
-}
-
-// Use byte-load when the param adress of the returned value is unaligned.
-// This may happen when the returned value is a field of a packed structure.
-static SDValue
-LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
- EVT ElementType, SDValue &InGlue,
- SmallVectorImpl<SDValue> &TempProxyRegOps,
- const SDLoc &dl) {
- // Bit logic only works on integer types
- EVT MergedType = ElementType;
- adjustElementType(MergedType);
-
- // Load each byte and construct the whole value. Initial value to 0
- SDValue RetVal = DAG.getConstant(0, dl, MergedType);
- // LoadParamMemI8 loads into i16 register only
- SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- InGlue};
- // This will be selected to LoadParamMemI8
- SDValue LdVal =
- DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
- MVT::i8, MachinePointerInfo(), Align(1));
- SDValue TmpLdVal = LdVal.getValue(0);
- Chain = LdVal.getValue(1);
- InGlue = LdVal.getValue(2);
-
- TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
- TmpLdVal.getSimpleValueType(), TmpLdVal);
- TempProxyRegOps.push_back(TmpLdVal);
-
- SDValue CMask = DAG.getConstant(255, dl, MergedType);
- SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
- // Need to extend the i16 register to the whole width.
- TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
- // Mask off the high bits. Leave only the lower 8bits.
- // Do this because we are using loadparam.b8.
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
- // Shift and merge
- TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
- RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
- }
- if (ElementType != MergedType)
- RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
-
- return RetVal;
-}
-
static bool shouldConvertToIndirectCall(const CallBase *CB,
const GlobalAddressSDNode *Func) {
if (!Func)
@@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SelectionDAG &DAG = CLI.DAG;
SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
+ const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
Type *RetTy = CLI.RetTy;
const CallBase *CB = CLI.CB;
@@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return DAG.getConstant(I, dl, MVT::i32);
};
+ const unsigned UniqueCallSite = GlobalUniqueCallSite++;
+ const SDValue CallChain = CLI.Chain;
+ const SDValue StartChain =
+ DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
+ SDValue DeclareGlue = StartChain.getValue(1);
+
+ SmallVector<SDValue, 16> CallPrereqs{StartChain};
+
+ const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
+ // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
+ // loaded/stored using i16, so it's handled here as well.
+ const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
+ SDValue Declare =
+ DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
+ const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
+ unsigned Size) {
+ SDValue Declare = DAG.getNode(
+ NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
// Variadic arguments.
//
// Normally, for each argument, we declare a param scalar or a param
@@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
//
// After all vararg is processed, 'VAOffset' holds the size of the
// vararg byte array.
+ assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
+ "Non-VarArg function with extra arguments");
- SDValue VADeclareParam; // vararg byte array
const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
- unsigned VAOffset = 0; // current offset in the param array
+ unsigned VAOffset = 0; // current offset in the param array
- const unsigned UniqueCallSite = GlobalUniqueCallSite++;
- SDValue TempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
- SDValue InGlue = Chain.getValue(1);
+ const SDValue VADeclareParam =
+ CLI.Args.size() > FirstVAArg
+ ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
+ Align(STI.getMaxRequiredAlignment()), 0)
+ : SDValue();
// Args.size() and Outs.size() need not match.
// Outs.size() will be larger
@@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
- const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
- if (IsVAArg) {
- if (ArgI == FirstVAArg) {
- VADeclareParam = DAG.getNode(
- NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
- GetI32(0), InGlue});
- return VADeclareParam;
- }
- return std::nullopt;
- }
- if (IsByVal || shouldPassAsArray(Arg.Ty)) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(ArgAlign.value()),
- GetI32(TypeSize), InGlue});
- }
+ const SDValue ArgDeclare = [&]() {
+ if (IsVAArg)
+ return VADeclareParam;
+
+ if (IsByVal || shouldPassAsArray(Arg.Ty))
+ return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
- // declare .param .b<size> .param<n>;
-
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- const unsigned PromotedSize =
- (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint())
- ? promoteScalarArgumentSize(TypeSize * 8)
- : TypeSize * 8;
-
- return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
+ assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
+ "Only int and float types are supported as non-array arguments");
+
+ return MakeDeclareScalarParam(ParamSymbol, TypeSize);
}();
- if (ArgDeclare) {
- Chain = ArgDeclare->getValue(0);
- InGlue = ArgDeclare->getValue(1);
- }
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
// than 32-bits are sign extended or zero extended, depending on
@@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
- const Align PartAlign) {
- SDValue StVal;
+ const MaybeAlign PartAlign) {
if (IsByVal) {
SDValue Ptr = ArgOutVals[0];
auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
SDValue SrcAddr =
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
- StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign);
- } else {
- StVal = ArgOutVals[I];
-
- auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
- if (PromotedVT != StVal.getValueType()) {
- StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
- StVal);
- }
+ return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
}
+ SDValue StVal = ArgOutVals[I];
+ assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+ StVal.getValueType() &&
+ "OutVal type should always be legal");
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal =
- DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- return StVal;
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT StoreVT =
+ ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+ return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
};
const auto VectorInfo =
@@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned J = 0;
for (const unsigned NumElts : VectorInfo) {
const int CurOffset = Offsets[J];
- EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
- const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
-
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar store. In such cases, fall back to byte stores.
- if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
-
- SDValue StVal = GetStoredValue(J, EltVT, PartAlign);
- Chain = LowerUnalignedStoreParam(DAG, Chain,
- CurOffset + (IsByVal ? VAOffset : 0),
- EltVT, StVal, InGlue, ArgI, dl);
-
- // LowerUnalignedStoreParam took care of inserting the necessary nodes
- // into the SDAG, so just move on to the next element.
- J++;
- continue;
- }
+ const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
if (IsVAArg && !IsByVal)
// Align each part of the variadic argument to their type.
@@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((IsVAArg || VAOffset == 0) &&
"VAOffset must be 0 for non-VA args");
- SmallVector<SDValue, 6> StoreOperands{
- Chain, GetI32(IsVAArg ? FirstVAArg : ArgI),
- GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))};
- // Record the values to store.
- for (const unsigned K : llvm::seq(NumElts))
- StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign));
- StoreOperands.push_back(InGlue);
+ const unsigned Offset =
+ (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
+ const MaybeAlign CurrentAlign = ExtendIntegerParam
+ ? MaybeAlign(std::nullopt)
+ : commonAlignment(ArgAlign, Offset);
+
+ SDValue Val;
+ if (NumElts == 1) {
+ Val = GetStoredValue(J, EltVT, CurrentAlign);
+ } else {
+ SmallVector<SDValue, 8> StoreVals;
+ for (const unsigned K : llvm::seq(NumElts)) {
+ SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
+ if (ValJ.getValueType().isVector())
+ DAG.ExtractVectorElements(ValJ, StoreVals);
+ else
+ StoreVals.push_back(ValJ);
+ }
+
+ EVT VT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
+ Val = DAG.getBuildVector(VT, dl, StoreVals);
}
- // Adjust type of the store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), PartAlign,
- MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, Val, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+ CallPrereqs.push_back(StoreParam);
// TODO: We may need to support vector types that can be passed
// as scalars in variadic arguments.
if (IsVAArg && !IsByVal) {
assert(NumElts == 1 &&
"Vectorization is expected to be disabled for variadics.");
+ const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
VAOffset +=
DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
}
@@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VAOffset += TypeSize;
}
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
-
// Handle Result
if (!Ins.empty()) {
- const SDValue RetDeclare = [&]() {
- const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
- const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
- if (shouldPassAsArray(RetTy)) {
- const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(RetAlign.value()),
- GetI32(ResultSize / 8), InGlue});
- }
- const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
- return DAG.getNode(
- NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
- }();
- Chain = RetDeclare.getValue(0);
- InGlue = RetDeclare.getValue(1);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
+ if (shouldPassAsArray(RetTy)) {
+ const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+ MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
+ } else {
+ MakeDeclareScalarParam(RetSymbol, ResultSize);
+ }
}
- const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
// Set the size of the vararg param byte array if the callee is a variadic
// function and the variadic part is not empty.
- if (HasVAArgs) {
+ if (VADeclareParam) {
SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
VADeclareParam.getOperand(1),
VADeclareParam.getOperand(2), GetI32(VAOffset),
@@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VADeclareParam->getVTList(), DeclareParamOps);
}
+ const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
// If the type of the callsite does not match that of the function, convert
// the callsite to an indirect call.
const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
@@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// instruction.
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
+ const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
std::string Proto =
getPrototype(DL, RetTy, Args, CLI.Outs,
HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
UniqueCallSite);
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
- Chain = DAG.getNode(
- NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue},
- {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue});
- InGlue = Chain.getValue(1);
+ const SDValue PrototypeDeclare = DAG.getNode(
+ NVPTXISD::CallPrototype, dl, MVT::Other,
+ {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
+ CallPrereqs.push_back(PrototypeDeclare);
}
if (ConvertToIndirectCall) {
@@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const unsigned NumArgs =
std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
- Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue},
- {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
- GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee,
- GetI32(Proto), InGlue});
- InGlue = Chain.getValue(1);
-
+ /// NumParams, Callee, Proto)
+ const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
+ const SDValue Call = DAG.getNode(
+ NVPTXISD::CALL, dl, MVT::Other,
+ {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
+ GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
+
+ SmallVector<SDValue, 16> LoadChains{Call};
SmallVector<SDValue, 16> ProxyRegOps;
- // An item of the vector is filled if the element does not need a ProxyReg
- // operation on it and should be added to InVals as is. ProxyRegOps and
- // ProxyRegTruncates contain empty/none items at the same index.
- SmallVector<SDValue, 16> RetElts;
- // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
- // to use the values of `LoadParam`s and to be replaced later then
- // `CALLSEQ_END` is added.
- SmallVector<SDValue, 16> TempProxyRegOps;
-
- // Generate loads from param memory/moves from registers for result
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
@@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned VectorizedSize : VectorInfo) {
- EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
- EVT EltType = Ins[I].VT;
- const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
-
- if (TheLoadType != VTs[I])
- EltType = TheLoadType;
-
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- } else if (TheLoadType.getSizeInBits() < 16) {
- EltType = MVT::i16;
- }
+ for (const unsigned NumElts : VectorInfo) {
+ const MaybeAlign CurrentAlign =
+ ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
+ : commonAlignment(RetAlign, Offsets[I]);
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar load. In such cases, fall back to byte loads.
- if (VectorizedSize == 1 && RetTy->isAggregateType() &&
- EltAlign < DAG.getEVTAlign(TheLoadType)) {
- SDValue Ret = LowerUnalignedLoadRetParam(
- DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl);
- ProxyRegOps.push_back(SDValue());
- RetElts.resize(I);
- RetElts.push_back(Ret);
-
- I++;
- continue;
- }
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT LoadVT =
+ ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
- SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType);
- LoadVTs.append({MVT::Other, MVT::Glue});
+ const unsigned PackingAmt =
+ LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
- NVPTXISD::NodeType Op;
- switch (VectorizedSize) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
+ const EVT VecVT = NumElts == 1 ? LoadVT
+ : EVT::getVectorVT(*DAG.getContext(),
+ LoadVT.getScalarType(),
+ NumElts * PackingAmt);
- SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
- for (const unsigned J : llvm::seq(VectorizedSize)) {
- ProxyRegOps.push_back(RetVal.getValue(J));
- }
+ SDValue R =
+ DAG.getLoad(VecVT, dl, Call, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
- Chain = RetVal.getValue(VectorizedSize);
- InGlue = RetVal.getValue(VectorizedSize + 1);
+ LoadChains.push_back(R.getValue(1));
- I += VectorizedSize;
+ if (NumElts == 1)
+ ProxyRegOps.push_back(R);
+ else
+ for (const unsigned J : llvm::seq(NumElts)) {
+ SDValue Elt = DAG.getNode(
+ LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT,
+ dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
+ ProxyRegOps.push_back(Elt);
+ }
+ I += NumElts;
}
}
- Chain =
- DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
- InGlue = Chain.getValue(1);
+ const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
+ const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
+ UniqueCallSite + 1, SDValue(), dl);
// Append ProxyReg instructions to the chain to make sure that `callseq_end`
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) {
- if (I < RetElts.size() && RetElts[I]) {
- InVals.push_back(RetElts[I]);
- continue;
- }
-
- SDValue Ret =
- DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
- {Chain, ProxyRegOps[I]});
-
- const EVT ExpectedVT = Ins[I].VT;
- if (!Ret.getValueType().bitsEq(ExpectedVT)) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret);
- }
+ for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
+ SDValue Proxy =
+ DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
+ SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
InVals.push_back(Ret);
}
- for (SDValue &T : TempProxyRegOps) {
- SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
- {Chain, T.getOperand(0)});
- DAG.ReplaceAllUsesWith(T, Repl);
- DAG.RemoveDeadNode(T.getNode());
- }
-
- // set isTailCall to false for now, until we figure out how to express
+ // set IsTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
- isTailCall = false;
- return Chain;
+ CLI.IsTailCall = false;
+ return CallEnd;
}
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
@@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
Operands.push_back(DCI.DAG.getIntPtrConstant(
cast<LoadSDNode>(LD)->getExtensionType(), DL));
break;
- case NVPTXISD::LoadParamV2:
- OldNumOutputs = 2;
- Opcode = NVPTXISD::LoadParamV4;
- break;
case NVPTXISD::LoadV2:
OldNumOutputs = 2;
Opcode = NVPTXISD::LoadV4;
@@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
- case NVPTXISD::StoreParam:
- Opcode = NVPTXISD::StoreParamV2;
- break;
- case NVPTXISD::StoreParamV2:
- Opcode = NVPTXISD::StoreParamV4;
- break;
case NVPTXISD::StoreV2:
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
@@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
Opcode = NVPTXISD::StoreV8;
break;
- case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreV8:
// PTX doesn't support the next doubling of operands
return SDValue();
@@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT, ST->getMemOperand());
}
-static SDValue PerformStoreCombineHelper(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- unsigned Front, unsigned Back) {
- if (all_of(N->ops().drop_front(Front).drop_back(Back),
- [](const SDUse &U) { return U.get()->isUndef(); }))
- // Operand 0 is the previous value in the chain. Cannot return EntryToken
- // as the previous value will become unused and eliminated later.
- return N->getOperand(0);
-
- return combinePackingMovIntoStore(N, DCI, Front, Back);
-}
-
static SDValue PerformStoreCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
-static SDValue PerformStoreParamCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Operands from the 3rd to the 2nd last one are the values to be stored.
- // {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, DCI, 3, 1);
-}
-
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5432,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N,
return SDValue();
}
+// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
+static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (!Op.hasOneUse())
+ return SDValue();
+ EVT ToVT = N->getValueType(0);
+ EVT FromVT = Op.getValueType();
+ if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
+ (ToVT == MVT::i64 && FromVT == MVT::i32)))
+ return SDValue();
+ if (!(Op.getOpcode() == ISD::MUL ||
+ (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned ExtOpcode = N->getOpcode();
+ unsigned Opcode = 0;
+ if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
+ Opcode = NVPTXISD::MUL_WIDE_SIGNED;
+ else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
+ Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
+ else
+ return SDValue();
+ SDValue RHS = Op.getOperand(1);
+ if (Op.getOpcode() == ISD::SHL) {
+ const auto ShiftAmt = Op.getConstantOperandVal(1);
+ const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
+ }
+ return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
+}
+
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -5942,6 +5765,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
N->getConstantOperandAPInt(2),
N->getConstantOperandVal(3)),
SDLoc(N), N->getValueType(0));
+ return SDValue();
+}
+
+// During call lowering we wrap the return values in a ProxyReg node which
+// depend on the chain value produced by the completed call. This ensures that
+// the full call is emitted in cases where libcalls are used to legalize
+// operations. To improve the functioning of other DAG combines we pull all
+// operations we can through one of these nodes, ensuring that the ProxyReg
+// directly wraps a load. That is:
+//
+// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
+//
+static SDValue sinkProxyReg(SDValue R, SDValue Chain,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ switch (R.getOpcode()) {
+ case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::BITCAST: {
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
+ return SDValue();
+ }
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::OR: {
+ if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
+ return SDValue();
+ }
+ case ISD::Constant:
+ return R;
+ case ISD::LOAD:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4: {
+ return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
+ {Chain, R});
+ }
+ case ISD::BUILD_VECTOR: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops;
+ for (auto &Op : R->ops()) {
+ SDValue V = sinkProxyReg(Op, Chain, DCI);
+ if (!V)
+ return SDValue();
+ Ops.push_back(V);
+ }
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
+ R.getValueType(), V, R.getOperand(1));
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+}
+
+static SDValue combineProxyReg(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ // If the ProxyReg is not wrapping a load, try to pull the operations through
+ // the ProxyReg.
+ if (Reg.getOpcode() != ISD::LOAD) {
+ if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
+ return V;
+ }
return SDValue();
}
@@ -5958,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return combineADDRSPACECAST(N, DCI);
case ISD::AND:
return PerformANDCombine(N, DCI);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return combineMulWide(N, DCI, OptLevel);
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
@@ -5965,7 +5871,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FADD:
return PerformFADDCombine(N, DCI, OptLevel);
case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
return combineUnpackingMovIntoLoad(N, DCI);
@@ -5973,6 +5878,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformMULCombine(N, DCI, OptLevel);
case NVPTXISD::PRMT:
return combinePRMT(N, DCI, OptLevel);
+ case NVPTXISD::ProxyReg:
+ return combineProxyReg(N, DCI);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::SHL:
@@ -5980,10 +5887,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SREM:
case ISD::UREM:
return PerformREMCombine(N, DCI, OptLevel);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -6332,6 +6235,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
Results.push_back(NewValue.getValue(3));
}
+static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
+
+ SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
+ SDValue NewProxy =
+ DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
+ SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
+
+ Results.push_back(Res);
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6349,6 +6268,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::CopyFromReg:
ReplaceCopyFromReg_128(N, DAG, Results);
return;
+ case NVPTXISD::ProxyReg:
+ replaceProxyReg(N, DAG, *this, Results);
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 228e2aa..cf72a1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -38,7 +38,7 @@ enum NodeType : unsigned {
/// This node represents a PTX call instruction. It's operands are as follows:
///
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
CALL,
MoveParam,
@@ -84,13 +84,7 @@ enum NodeType : unsigned {
StoreV2,
StoreV4,
StoreV8,
- LoadParam,
- LoadParamV2,
- LoadParamV4,
- StoreParam,
- StoreParamV2,
- StoreParamV4,
- LAST_MEMORY_OPCODE = StoreParamV4,
+ LAST_MEMORY_OPCODE = StoreV8,
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 442b900..6000b40 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
-def doMulWide : Predicate<"doMulWide">;
-
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
@@ -836,36 +834,28 @@ def MULWIDES64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">;
def MULWIDES64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">;
-def MULWIDES64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">;
def MULWIDEU64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">;
def MULWIDEU64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">;
-def MULWIDEU64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">;
def MULWIDES32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">;
def MULWIDES32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">;
-def MULWIDES32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">;
def MULWIDEU32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">;
def MULWIDEU32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">;
-def MULWIDEU32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">;
-def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>;
// Matchers for signed, unsigned mul.wide ISD nodes.
-let Predicates = [doMulWide] in {
+let Predicates = [hasOptEnabled] in {
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
@@ -877,85 +867,6 @@ let Predicates = [doMulWide] in {
def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
}
-// Predicates used for converting some patterns to mul.wide.
-def SInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(16);
-}]>;
-
-def IntConst_0_30 : PatLeaf<(imm), [{
- // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(31);
-}]>;
-
-def IntConst_0_14 : PatLeaf<(imm), [{
- // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(15);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(32, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(16, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-let Predicates = [doMulWide] in {
- def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDES64Imm $a, (SHL2MUL32 $b))>;
- def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDEU64Imm $a, (SHL2MUL32 $b))>;
-
- def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDES32Imm $a, (SHL2MUL16 $b))>;
- def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDEU32Imm $a, (SHL2MUL16 $b))>;
-
- // Convert "sign/zero-extend then multiply" to mul.wide.
- def : Pat<(mul (sext i32:$a), (sext i32:$b)),
- (MULWIDES64 $a, $b)>;
- def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>;
-
- def : Pat<(mul (zext i32:$a), (zext i32:$b)),
- (MULWIDEU64 $a, $b)>;
- def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>;
-
- def : Pat<(mul (sext i16:$a), (sext i16:$b)),
- (MULWIDES32 $a, $b)>;
- def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>;
-
- def : Pat<(mul (zext i16:$a), (zext i16:$b)),
- (MULWIDEU32 $a, $b)>;
- def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>;
-}
-
//
// Integer multiply-add
//
@@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>;
defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>;
}
+multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> {
+ def rrr:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>;
+ def rri:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>;
+ def rir:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>;
+ def rii:
+ BasicNVPTXInst<(outs BigT.RC:$dst),
+ (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c),
+ "mad.wide." # PtxSuffix,
+ [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>;
+}
+
+def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>;
+def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>;
+
+let Predicates = [hasOptEnabled] in {
+defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>;
+defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>;
+}
+
foreach t = [I16RT, I32RT, I64RT] in {
def NEG_S # t.Size :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
@@ -1516,20 +1460,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN
// Byte extraction via shift/trunc/sext
-def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
- (CVT_s8_s32 $s, CvtNONE)>;
-def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
+def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>;
+
+def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>;
+def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>;
+
+def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
(CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
-def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8),
- (BFE_S32rii $s, imm:$o, 8)>;
+def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
+ (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
+
def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
(CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
-def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8),
- (BFE_S64rii $s, imm:$o, 8)>;
-def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
- (CVT_s8_s64 $s, CvtNONE)>;
-def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
- (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
//-----------------------------------
// Comparison instructions (setp, set)
@@ -1713,56 +1656,39 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>;
//-----------------------------------
// Comparison and Selection
//-----------------------------------
+// TODO: These patterns seem very specific and brittle. We should try to find
+// a more general solution.
def cond_signed : PatLeaf<(cond), [{
return isSignedIntSetCC(N->get());
}]>;
-def cond_not_signed : PatLeaf<(cond), [{
- return !isSignedIntSetCC(N->get());
-}]>;
+// A 16-bit signed comparison of sign-extended byte extracts can be converted
+// to 32-bit comparison if we change the PRMT to sign-extend the extracted
+// bytes.
+def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
+ (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
+ cond_signed:$cc),
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
+ (cond2cc $cc))>;
+
+// A 16-bit comparison of truncated byte extracts can be be converted to 32-bit
+// comparison because we know that the truncate is just trancating off zeros
+// and that the most-significant byte is also zeros so the meaning of signed and
+// unsigned comparisons will not be changed.
+def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+ (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
+ cond:$cc),
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
-// comparisons of i8 extracted with PRMT as i32
-// It's faster to do comparison directly on i32 extracted by PRMT,
-// instead of the long conversion and sign extending.
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
- (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
- cond_signed:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
-
-def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
- (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
- cond_signed:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
-
-def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
- (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
- cond_signed:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
-
-def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
- (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
- cond_not_signed:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -1774,104 +1700,20 @@ def declare_array_param :
def declare_scalar_param :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def proxy_reg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
def SDTCallProfile : SDTypeProfile<0, 6,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>,
SDTCisVT<3, i32>, SDTCisVT<5, i32>]>;
-def call :
- SDNode<"NVPTXISD::CALL", SDTCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-let mayLoad = true in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
- !strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins Offseti32imm:$b),
- !strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
- []>;
-}
-
-let mayStore = true in {
-
- multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
- foreach op = [IMMType, regclass] in
- if !or(support_imm, !isa<NVPTXRegClass>(op)) then
- def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
- : NVPTXInst<(outs),
- (ins op:$val, i32imm:$a, Offseti32imm:$b),
- "st.param" # opstr # " \t[param$a$b], $val;",
- []>;
- }
-
- multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
- []>;
- }
-
- multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- foreach op3 = [IMMType, regclass] in
- foreach op4 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- # !if(!isa<NVPTXRegClass>(op3), "r", "i")
- # !if(!isa<NVPTXRegClass>(op4), "r", "i")
-
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v4" # opstr #
- " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
- []>;
- }
-}
+def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
-/// NumParams, Callee, Proto, InGlue)
+/// NumParams, Callee, Proto)
def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; }
@@ -1908,43 +1750,6 @@ foreach is_convergent = [0, 1] in {
(call_uni_inst $addr, imm:$rets, imm:$params)>;
}
-def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">;
-
-defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">;
-defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">;
-defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">;
-defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">;
-
-defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>;
-defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>;
-
-defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">;
-defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">;
-defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">;
-defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">;
-
-defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">;
-defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">;
-defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">;
-
-defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">;
-defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">;
-
-defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">;
-defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">;
-
-defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">;
-
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
".param .align $align .b8 \t$a[$size];", []>;
@@ -1957,6 +1762,18 @@ def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size),
def : Pat<(declare_scalar_param externalsym:$a, imm:$size),
(DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>;
+// Call prototype wrapper, this is a dummy instruction that just prints it's
+// operand which is string defining the prototype.
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype :
+ SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; }
+def CALL_PROTOTYPE :
+ NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
foreach t = [I32RT, I64RT] in {
defvar inst_name = "MOV" # t.Size # "_PARAM";
def inst_name : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), "mov.b" # t.Size>;
@@ -1976,6 +1793,32 @@ defm ProxyRegB16 : ProxyRegInst<"b16", B16>;
defm ProxyRegB32 : ProxyRegInst<"b32", B32>;
defm ProxyRegB64 : ProxyRegInst<"b64", B64>;
+
+// Callseq start and end
+
+// Note: these nodes are marked as SDNPMayStore and SDNPMayLoad because
+// they define the scope in which the declared params may be used. Therefore
+// we add these flags to ensure ld.param and st.param are not sunk or hoisted
+// out of that scope.
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>,
+ [SDNPHasChain, SDNPOutGlue,
+ SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>;
+
+def Callseq_Start :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def Callseq_End :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\} // callseq $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+
//
// Load / Store Handling
//
@@ -2519,26 +2362,6 @@ def : Pat<(brcond i32:$a, bb:$target),
def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
(CBranchOther $a, bb:$target)>;
-// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
- SDTCisVT<1, i32>]>;
-def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPSideEffect]>;
-
-def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\\{ // callseq $amt1, $amt2",
- [(callseq_start timm:$amt1, timm:$amt2)]>;
-def Callseq_End :
- NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\\} // callseq $amt1",
- [(callseq_end timm:$amt1, timm:$amt2)]>;
-
// trap instruction
def trapinst : BasicNVPTXInst<(outs), (ins), "trap", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
@@ -2547,18 +2370,6 @@ def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[
// brkpt instruction
def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>;
-// Call prototype wrapper
-def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def CallPrototype :
- SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def ProtoIdent : Operand<i32> {
- let PrintMethod = "printProtoIdent";
-}
-def CALL_PROTOTYPE :
- NVPTXInst<(outs), (ins ProtoIdent:$ident),
- "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
-
def SDTDynAllocaOp :
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5779d4e..0e8828f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -243,8 +243,6 @@ public:
createObjectTargetWriter() const override {
return createPPCXCOFFObjectWriter(TT.isArch64Bit());
}
-
- std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
};
} // end anonymous namespace
@@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
return std::nullopt;
}
-std::optional<MCFixupKind>
-XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const {
- return StringSwitch<std::optional<MCFixupKind>>(Name)
- .Case("R_REF", PPC::fixup_ppc_nofixup)
- .Default(std::nullopt);
-}
-
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 9e8ee9f..df0c666 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -48,8 +48,7 @@ enum Fixups {
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
- /// register number. It can also be used to tie the ref symbol to prevent it
- /// from being garbage collected on AIX.
+ /// register number.
fixup_ppc_nofixup,
/// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index f75ab62..a04f404 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
switch ((unsigned)Fixup.getKind()) {
default:
report_fatal_error("Unimplemented fixup kind.");
+ case XCOFF::RelocationType::R_REF:
+ return {XCOFF::RelocationType::R_REF, 0};
case PPC::fixup_ppc_half16: {
const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15;
switch (Specifier) {
@@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
case PPC::fixup_ppc_br24abs:
return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
- case PPC::fixup_ppc_nofixup: {
- if (Specifier == PPC::S_None)
- return {XCOFF::RelocationType::R_REF, 0};
- else
- llvm_unreachable("Unsupported Modifier");
- } break;
case FK_Data_4:
case FK_Data_8:
const uint8_t SignAndSizeForFKData =
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index d295f35..1dc485d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
(COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
}
-class XXEvalPattern <dag pattern, bits<8> imm> :
- Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+// =============================================================================
+// XXEVAL Instruction Pattern Definitions
+// =============================================================================
+//
+// XXEVAL instruction performs 256 different logical operations on three vector
+// operands using an 8-bit immediate value to select the operation.
+// Format: xxeval XT, XA, XB, XC, IMM
+// For example:
+// Equivalent function A?xor(B,C):and(B,C) is performed by
+// xxeval XT, XA, XB, XC, 22
+//
+// REGISTER CLASS CONSTRAINTS:
+// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64]
+// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC
+// =============================================================================
+
+class XXEvalPattern<dag pattern, bits<8> imm>
+ : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+
+class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm>
+ : Pat<(Vt InputPattern),
+ !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)),
+ // VSRC path: direct XXEVAL for v4i32 and v2i64
+ (XXEVAL $vA, $vB, $vC, Imm),
+ // VRRC path: wrap with COPY_TO_REGCLASS for other types
+ (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC),
+ (COPY_TO_REGCLASS Vt:$vB, VSRC),
+ (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm),
+ VRRC))> {}
+
+// =============================================================================
+// PatFrags for Bitcast-Aware Vector bitwise Operations
+//
+// Each PatFrags defines TWO alternatives for pattern matcher to choose:
+// - Direct operation (for v4i32)
+// - Bitcast operation (for other types: v2i64, v16i8, v8i16)
+// =============================================================================
+
+// Basic Binary Operations
+def VAnd
+ : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b),
+ (bitconvert(and
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VXor
+ : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b),
+ (bitconvert(xor
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b),
+ (bitconvert(or
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b))))]>;
+
+def VNot
+ : PatFrags<(ops node:$a), [(vnot node:$a),
+ (bitconvert(vnot(v4i32(bitconvert node:$a))))]>;
+
+// Derived bitwise operations
+// Vector NOR operation (not(or))
+def VNor
+ : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)),
+ (bitconvert(vnot(or
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b)))))]>;
+
+// Vector EQV operation (not(xor))
+def VEqv
+ : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)),
+ (bitconvert(vnot(xor
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b)))))]>;
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C)
+// and emit the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT)
+// - AND(B,C) is the "false" case op on vectors B and C
+// =============================================================================
+multiclass XXEvalTernarySelectAnd<ValueType Vt> {
+ // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 22>;
+
+ // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 24>;
+
+ // Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)),
+ 25>;
+
+ // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>;
+
+ // Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28
+ def : XXEvalPatterns<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>;
+}
let Predicates = [PrefixInstrs, HasP10Vector] in {
let AddedComplexity = 400 in {
@@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
// (xor A, (or B, C))
def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
+ // XXEval Patterns for ternary Operations.
+ foreach Ty = [v4i32, v2i64, v8i16, v16i8] in {
+ defm : XXEvalTernarySelectAnd<Ty>;
+ }
+
// Anonymous patterns to select prefixed VSX loads and stores.
// Load / Store f128
def : Pat<(f128 (load PDForm:$src)),
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 5e54b82..67cc01e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
+ if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (Imm < RISCVZC::RA_S0)
+ return MCDisassembler::Fail;
+ return decodeZcmpRlist(Inst, Imm, Address, Decoder);
+}
+
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
+ uint64_t Address,
const MCDisassembler *Decoder);
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
@@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
return S;
}
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
- if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(Imm));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (Imm < RISCVZC::RA_S0)
- return MCDisassembler::Fail;
- return decodeZcmpRlist(Inst, Imm, Address, Decoder);
-}
-
// Add implied SP operand for C.*SP compressed instructions. The SP operand
// isn't explicitly encoded in the instruction.
void RISCVDisassembler::addSPOperands(MCInst &MI) const {
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index 4c303a9..da6b95d 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt,
// Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31.
def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt,
(sequence "X%u", 16, 31))>;
+
+def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>;
+def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs,
+ (sequence "X%u", 16, 31))>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 34910b7..f223fdbe 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
// Transform (sra (shl X, C1) C2) with C1 < C2
// -> (SignedBitfieldExtract X, msb, lsb)
if (N0.getOpcode() == ISD::SHL) {
- auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01C)
return false;
@@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
// Transform (sra (shl X, C1) C2) with C1 > C2
// -> (NDS.BFOS X, lsb, msb)
if (N0.getOpcode() == ISD::SHL) {
- auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01C)
return false;
@@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C)
// where C2 has 32 leading zeros and C3 trailing zeros.
SDNode *SRLIW = CurDAG->getMachineNode(
- RISCV::SRLIW, DL, VT, N0->getOperand(0),
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(TrailingZeros, DL, VT));
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
@@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// - without Zba a tablegen pattern applies the very same
// transform as we would have done here
SDNode *SLLI = CurDAG->getMachineNode(
- RISCV::SLLI, DL, VT, N0->getOperand(0),
+ RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LeadingZeros, DL, VT));
SDNode *SRLI = CurDAG->getMachineNode(
RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
@@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned TrailingZeros = llvm::countr_zero(Mask);
if (LeadingZeros == 32 && TrailingZeros > ShAmt) {
SDNode *SRLIW = CurDAG->getMachineNode(
- RISCV::SRLIW, DL, VT, N0->getOperand(0),
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(TrailingZeros, DL, VT));
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
@@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (TrailingOnes == 32) {
SDNode *SRLI = CurDAG->getMachineNode(
Subtarget->is64Bit() ? RISCV::SRLIW : RISCV::SRLI, DL, VT,
- N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
+ N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
ReplaceNode(Node, SRLI);
return;
}
@@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (HasBitTest && ShAmt + 1 == TrailingOnes) {
SDNode *BEXTI = CurDAG->getMachineNode(
Subtarget->hasStdExtZbs() ? RISCV::BEXTI : RISCV::TH_TST, DL, VT,
- N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
+ N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT));
ReplaceNode(Node, BEXTI);
return;
}
const unsigned Msb = TrailingOnes - 1;
const unsigned Lsb = ShAmt;
- if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb))
+ if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb))
return;
unsigned LShAmt = Subtarget->getXLen() - TrailingOnes;
SDNode *SLLI =
- CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LShAmt, DL, VT));
SDNode *SRLI = CurDAG->getMachineNode(
RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
@@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
break;
unsigned LShAmt = Subtarget->getXLen() - ExtSize;
SDNode *SLLI =
- CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LShAmt, DL, VT));
SDNode *SRAI = CurDAG->getMachineNode(
RISCV::SRAI, DL, VT, SDValue(SLLI, 0),
@@ -2942,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
/// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9.
bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
SDValue &Offset) {
- // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
- // a 9-bit immediate can be folded.
+ if (SelectAddrFrameIndex(Addr, Base, Offset))
+ return true;
SDLoc DL(Addr);
MVT VT = Addr.getSimpleValueType();
@@ -2953,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
if (isUInt<9>(CVal)) {
Base = Addr.getOperand(0);
- // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
- // a 9-bit immediate can be folded.
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 54845e5..c0ada51 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20730,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getAllOnesConstant(DL, VT);
return DAG.getConstant(0, DL, VT);
}
+ case Intrinsic::riscv_vsseg2_mask:
+ case Intrinsic::riscv_vsseg3_mask:
+ case Intrinsic::riscv_vsseg4_mask:
+ case Intrinsic::riscv_vsseg5_mask:
+ case Intrinsic::riscv_vsseg6_mask:
+ case Intrinsic::riscv_vsseg7_mask:
+ case Intrinsic::riscv_vsseg8_mask: {
+ SDValue Tuple = N->getOperand(2);
+ unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+
+ if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() ||
+ Tuple.getOpcode() != RISCVISD::TUPLE_INSERT ||
+ !Tuple.getOperand(0).isUndef())
+ return SDValue();
+
+ SDValue Val = Tuple.getOperand(1);
+ unsigned Idx = Tuple.getConstantOperandVal(2);
+
+ unsigned SEW = Val.getValueType().getScalarSizeInBits();
+ assert(Log2_64(SEW) == N->getConstantOperandVal(6) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/N->getOperand(0),
+ /*IntID=*/
+ DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT),
+ /*StoredVal=*/Val,
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/N->getOperand(4),
+ /*VL=*/N->getOperand(5)};
+
+ auto *OldMemSD = cast<MemIntrinsicSDNode>(N);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = OldMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT,
+ MMO);
+ }
}
}
case ISD::EXPERIMENTAL_VP_REVERSE:
@@ -20822,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case RISCVISD::TUPLE_EXTRACT: {
+ EVT VT = N->getValueType(0);
+ SDValue Tuple = N->getOperand(0);
+ unsigned Idx = N->getConstantOperandVal(1);
+ if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ break;
+
+ unsigned NF = 0;
+ switch (Tuple.getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask:
+ NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+ break;
+ }
+
+ if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
+ break;
+
+ unsigned SEW = VT.getScalarSizeInBits();
+ assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/Tuple.getOperand(0),
+ /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
+ /*Passthru=*/Tuple.getOperand(2),
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/Tuple.getOperand(4),
+ /*VL=*/Tuple.getOperand(5),
+ /*Policy=*/Tuple.getOperand(6)};
+
+ auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList({VT, MVT::Other});
+ SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+ Ops, MemVT, MMO);
+ DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
+ return Result.getValue(0);
+ }
+ case RISCVISD::TUPLE_INSERT: {
+ // tuple_insert tuple, undef, idx -> tuple
+ if (N->getOperand(1).isUndef())
+ return N->getOperand(0);
+ break;
+ }
}
return SDValue();
@@ -22346,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
+ case CallingConv::PreserveMost:
case CallingConv::GRAAL:
case CallingConv::RISCV_VectorCall:
#define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN:
@@ -22615,8 +22725,14 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
+ const CallBase *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction::CallSiteInfo CSInfo;
+
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -22874,6 +22990,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (CLI.CFIType)
Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
+ if (MF.getTarget().Options.EmitCallGraphSection && CB &&
+ CB->isIndirectCall())
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
@@ -22881,6 +23000,10 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
if (CLI.CFIType)
Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
+
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index dd365cf..8297d50 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
+let IsSignExtendingOpW = 1 in
def CLS : Unary_r<0b011000000011, 0b001, "cls">;
def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
@@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in {
def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+let IsSignExtendingOpW = 1 in {
def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+}
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 6afc942d..03e6f43 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -1510,21 +1510,6 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
let VLMul = MInfo.value;
}
-class VPseudoTernaryNoMask<VReg RetClass,
- RegisterClass Op1Class,
- DAGOperand Op2Class,
- string Constraint> :
- RISCVVPseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew)> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $rs3"], ",");
- let HasVLOp = 1;
- let HasSEWOp = 1;
-}
-
class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index f391300..5265613 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in {
def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">;
def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">;
- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
- def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs),
- (ins uimm5nonzero:$imm),
- "qc.c.delay", "$imm"> {
- let Inst{12} = 0;
- let Inst{11-7} = 0;
- let Inst{6-2} = imm{4-0};
- }
+ // qc.c.delay implemented as an alias, below
} // Predicates = [HasVendorXqcisync, IsRV32]
let Predicates = [HasVendorXqcisim, IsRV32] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
- def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10),
- "qc.psyscalli", "$imm10"> {
- bits<10> imm10;
-
- let rs1 = 0;
- let rd = 0;
- let imm12 = {0b00, imm10};
- }
-
def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8),
"qc.pputci", "$imm8"> {
bits<8> imm8;
@@ -1150,18 +1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
let imm12 = {0b0100, imm8};
}
- def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">;
- def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">;
- def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">;
- def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">;
- def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">;
- def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">;
- def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">;
-
- def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> {
- let rd = 0;
- let imm = 0;
- }
+ // The other instructions are all implemented as aliases, below
} // mayLoad = 0, mayStore = 0, hasSideEffects = 1
} // Predicates = [HasVendorXqcisim, IsRV32]
@@ -1218,6 +1191,27 @@ let EmitPriority = 0 in {
} // EmitPriority = 0
} // Predicates = [HasVendorXqcilo, IsRV32]
+let Predicates = [HasVendorXqcisim, IsRV32] in {
+let EmitPriority = 1 in {
+ def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>;
+
+ def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>;
+ def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>;
+ def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>;
+ def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>;
+ def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>;
+ def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>;
+ def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>;
+ def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>;
+} // EmitPriority = 1
+} // Predicates = [HasVendorXqcisim, IsRV32]
+
+let Predicates = [HasVendorXqcisync, IsRV32] in {
+let EmitPriority = 1 in {
+ def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>;
+}
+} // Predicates = [HasVendorXqcisync, IsRV32]
+
//===----------------------------------------------------------------------===//
// Pseudo-instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 30d8f85..726920e 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- // If the segment load is going to be performed segment at a time anyways
- // and there's only one element used, use a strided load instead. This
- // will be equally fast, and create less vector register pressure.
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
- unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
- // the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
- {VTy, BasePtr->getType(), Stride->getType()},
- {BasePtr, Stride, Mask, VL});
- Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
- CI->addParamAttr(0,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- Shuffles[0]->replaceAllUsesWith(CI);
- return true;
- };
-
CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
@@ -289,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- unsigned Index;
- // If the segment store only has one active lane (i.e. the interleave is
- // just a spread shuffle), we can use a strided store instead. This will
- // be equally fast, and create less vector register pressure.
- if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
- isSpreadMask(Mask, Factor, Index)) {
- unsigned ScalarSizeInBytes =
- DL.getTypeStoreSize(ShuffleVTy->getElementType());
- Value *Data = SVI->getOperand(0);
- Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at
- // most the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
- {VTy, BasePtr->getType(), Stride->getType()},
- {Data, BasePtr, Stride, LaneMask, VL});
- Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
- CI->addParamAttr(1,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- return true;
- }
-
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 5404123..7e58b6f 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
return CSR_NoRegs_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return Subtarget.hasStdExtE() ? CSR_RT_MostRegs_RVE_SaveList
+ : CSR_RT_MostRegs_SaveList;
if (MF->getFunction().hasFnAttribute("interrupt")) {
if (Subtarget.hasVInstructions()) {
if (Subtarget.hasStdExtD())
@@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int64_t Val = Offset.getFixed();
int64_t Lo12 = SignExtend64<12>(Val);
unsigned Opc = MI.getOpcode();
+
if (Opc == RISCV::ADDI && !isInt<12>(Val)) {
// We chose to emit the canonical immediate sequence rather than folding
// the offset into the using add under the theory that doing so doesn't
@@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
(Lo12 & 0b11111) != 0) {
// Prefetch instructions require the offset to be 32 byte aligned.
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+ } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) {
+ // MIPS Prefetch instructions require the offset to be 9 bits encoded.
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
} else if ((Opc == RISCV::PseudoRV32ZdinxLD ||
Opc == RISCV::PseudoRV32ZdinxSD) &&
Lo12 >= 2044) {
@@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
- switch (Subtarget.getTargetABI()) {
+ RISCVABI::ABI ABI = Subtarget.getTargetABI();
+ if (CC == CallingConv::PreserveMost) {
+ if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E)
+ return CSR_RT_MostRegs_RVE_RegMask;
+ return CSR_RT_MostRegs_RegMask;
+ }
+ switch (ABI) {
default:
llvm_unreachable("Unrecognized ABI");
case RISCVABI::ABI_ILP32E:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fd634b5..61dbd06 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{
{Intrinsic::roundeven, MVT::f64, 9},
{Intrinsic::rint, MVT::f32, 7},
{Intrinsic::rint, MVT::f64, 7},
- {Intrinsic::lrint, MVT::i32, 1},
- {Intrinsic::lrint, MVT::i64, 1},
- {Intrinsic::llrint, MVT::i64, 1},
{Intrinsic::nearbyint, MVT::f32, 9},
{Intrinsic::nearbyint, MVT::f64, 9},
{Intrinsic::bswap, MVT::i16, 3},
@@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
switch (ICA.getID()) {
case Intrinsic::lrint:
case Intrinsic::llrint:
- // We can't currently lower half or bfloat vector lrint/llrint.
- if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
- VecTy && VecTy->getElementType()->is16bitFPTy())
- return InstructionCost::getInvalid();
- [[fallthrough]];
+ case Intrinsic::lround:
+ case Intrinsic::llround: {
+ auto LT = getTypeLegalizationCost(RetTy);
+ Type *SrcTy = ICA.getArgTypes().front();
+ auto SrcLT = getTypeLegalizationCost(SrcTy);
+ if (ST->hasVInstructions() && LT.second.isVector()) {
+ SmallVector<unsigned, 2> Ops;
+ unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
+ unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
+ if (LT.second.getVectorElementType() == MVT::bf16) {
+ if (!ST->hasVInstructionsBF16Minimal())
+ return InstructionCost::getInvalid();
+ if (DstEltSz == 32)
+ Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
+ else
+ Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
+ } else if (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16()) {
+ if (!ST->hasVInstructionsF16Minimal())
+ return InstructionCost::getInvalid();
+ if (DstEltSz == 32)
+ Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
+ else
+ Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
+
+ } else if (SrcEltSz > DstEltSz) {
+ Ops = {RISCV::VFNCVT_X_F_W};
+ } else if (SrcEltSz < DstEltSz) {
+ Ops = {RISCV::VFWCVT_X_F_V};
+ } else {
+ Ops = {RISCV::VFCVT_X_F_V};
+ }
+
+ // We need to use the source LMUL in the case of a narrowing op, and the
+ // destination LMUL otherwise.
+ if (SrcEltSz > DstEltSz)
+ return SrcLT.first *
+ getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
+ return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
+ }
+ break;
+ }
case Intrinsic::ceil:
case Intrinsic::floor:
case Intrinsic::trunc:
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index c1cc19b..050de3d 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -646,8 +646,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
if (!Src || Src->hasUnmodeledSideEffects() ||
Src->getParent() != MI.getParent() ||
!RISCVII::isFirstDefTiedToFirstUse(Src->getDesc()) ||
- !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
- !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags))
+ !RISCVII::hasVLOp(Src->getDesc().TSFlags))
return false;
// Src's dest needs to have the same EEW as MI's input.
@@ -681,12 +680,14 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
*Src->getParent()->getParent()));
}
- // If MI was tail agnostic and the VL didn't increase, preserve it.
- int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED;
- if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) &&
- RISCV::isVLKnownLE(MI.getOperand(3), SrcVL))
- Policy |= RISCVVType::TAIL_AGNOSTIC;
- Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy);
+ if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) {
+ // If MI was tail agnostic and the VL didn't increase, preserve it.
+ int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED;
+ if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) &&
+ RISCV::isVLKnownLE(MI.getOperand(3), SrcVL))
+ Policy |= RISCVVType::TAIL_AGNOSTIC;
+ Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy);
+ }
MRI->constrainRegClass(Src->getOperand(0).getReg(),
MRI->getRegClass(MI.getOperand(0).getReg()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 5cda6a0..7505507 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass {
// Returns the loaded value.
Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType,
FixedVectorType *TargetType, Value *Source) {
- // We expect the codegen to avoid doing implicit bitcast from a load.
- assert(TargetType->getElementType() == SourceType->getElementType());
- assert(TargetType->getNumElements() < SourceType->getNumElements());
-
+ assert(TargetType->getNumElements() <= SourceType->getNumElements());
LoadInst *NewLoad = B.CreateLoad(SourceType, Source);
buildAssignType(B, SourceType, NewLoad);
+ Value *AssignValue = NewLoad;
+ if (TargetType->getElementType() != SourceType->getElementType()) {
+ AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast,
+ {TargetType, SourceType}, {NewLoad});
+ buildAssignType(B, TargetType, AssignValue);
+ }
SmallVector<int> Mask(/* Size= */ TargetType->getNumElements());
for (unsigned I = 0; I < TargetType->getNumElements(); ++I)
Mask[I] = I;
- Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask);
+ Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask);
buildAssignType(B, TargetType, Output);
return Output;
}
@@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass {
Output = loadFirstValueFromAggregate(B, SVT->getElementType(),
OriginalOperand, LI);
}
- // Destination is a smaller vector than source.
+ // Destination is a smaller vector than source or different vector type.
// - float3 v3 = vector4;
+ // - float4 v2 = int4;
else if (SVT && DVT)
Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand);
// Destination is the scalar type stored at the start of an aggregate.
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 721f64a..1995e0f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal();
}
+ getActionDefinitionsBuilder(G_IS_FPCLASS).custom();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType,
bool SPIRVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
- auto Opc = MI.getOpcode();
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- if (Opc == TargetOpcode::G_ICMP) {
+ switch (MI.getOpcode()) {
+ default:
+ // TODO: implement legalization for other opcodes.
+ return true;
+ case TargetOpcode::G_IS_FPCLASS:
+ return legalizeIsFPClass(Helper, MI, LocObserver);
+ case TargetOpcode::G_ICMP: {
assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg()));
auto &Op0 = MI.getOperand(2);
auto &Op1 = MI.getOperand(3);
@@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom(
}
return true;
}
- // TODO: implement legalization for other opcodes.
+ }
+}
+
+// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted
+// to ensure that all instructions created during the lowering have SPIR-V types
+// assigned to them.
+bool SPIRVLegalizerInfo::legalizeIsFPClass(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
+
+ auto &MIRBuilder = Helper.MIRBuilder;
+ auto &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ Type *LLVMDstTy =
+ IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits());
+ if (DstTy.isVector())
+ LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount());
+ SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType(
+ LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+
+ unsigned BitSize = SrcTy.getScalarSizeInBits();
+ const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
+
+ LLT IntTy = LLT::scalar(BitSize);
+ Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize);
+ if (SrcTy.isVector()) {
+ IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
+ LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount());
+ }
+ SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType(
+ LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+
+ // Clang doesn't support capture of structured bindings:
+ LLT DstTyCopy = DstTy;
+ const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) {
+ // Assign this MI's (assumed only) destination to one of the two types we
+ // expect: either the G_IS_FPCLASS's destination type, or the integer type
+ // bitcast from the source type.
+ LLT MITy = MRI.getType(MI.getReg(0));
+ assert((MITy == IntTy || MITy == DstTyCopy) &&
+ "Unexpected LLT type while lowering G_IS_FPCLASS");
+ auto *SPVTy = MITy == IntTy ? SPIRVIntTy : SPIRVDstTy;
+ GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF);
+ return MI;
+ };
+
+ // Helper to build and assign a constant in one go
+ const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder {
+ if (!Ty.isFixedVector())
+ return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C));
+ auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C);
+ assert((Ty == IntTy || Ty == DstTyCopy) &&
+ "Unexpected LLT type while lowering constant for G_IS_FPCLASS");
+ SPIRVType *VecEltTy = GR->getOrCreateSPIRVType(
+ (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite,
+ /*EmitIR*/ true);
+ GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF);
+ return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC));
+ };
+
+ if (Mask == fcNone) {
+ MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0));
+ MI.eraseFromParent();
+ return true;
+ }
+ if (Mask == fcAllFlags) {
+ MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1));
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Note that rather than creating a COPY here (between a floating-point and
+ // integer type of the same size) we create a SPIR-V bitcast immediately. We
+ // can't create a G_BITCAST because the LLTs are the same, and we can't seem
+ // to correctly lower COPYs to SPIR-V bitcasts at this moment.
+ Register ResVReg = MRI.createGenericVirtualRegister(IntTy);
+ MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy));
+ GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF());
+ auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast)
+ .addDef(ResVReg)
+ .addUse(GR->getSPIRVTypeID(SPIRVIntTy))
+ .addUse(SrcReg);
+ AsInt = assignSPIRVTy(std::move(AsInt));
+
+ // Various masks.
+ APInt SignBit = APInt::getSignMask(BitSize);
+ APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign.
+ APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
+ APInt ExpMask = Inf;
+ APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
+ APInt QNaNBitMask =
+ APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
+ APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
+
+ auto SignBitC = buildSPIRVConstant(IntTy, SignBit);
+ auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask);
+ auto InfC = buildSPIRVConstant(IntTy, Inf);
+ auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask);
+ auto ZeroC = buildSPIRVConstant(IntTy, 0);
+
+ auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC));
+ auto Sign = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs));
+
+ auto Res = buildSPIRVConstant(DstTy, 0);
+
+ const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) {
+ Res = assignSPIRVTy(
+ MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend))));
+ };
+
+ // Tests that involve more than one class should be processed first.
+ if ((Mask & fcFinite) == fcFinite) {
+ // finite(V) ==> abs(V) u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
+ ExpMaskC));
+ Mask &= ~fcFinite;
+ } else if ((Mask & fcFinite) == fcPosFinite) {
+ // finite(V) && V > 0 ==> V u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
+ ExpMaskC));
+ Mask &= ~fcPosFinite;
+ } else if ((Mask & fcFinite) == fcNegFinite) {
+ // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
+ auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT,
+ DstTy, Abs, ExpMaskC));
+ appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign));
+ Mask &= ~fcNegFinite;
+ }
+
+ if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
+ // fcZero | fcSubnormal => test all exponent bits are 0
+ // TODO: Handle sign bit specific cases
+ // TODO: Handle inverted case
+ if (PartialCheck == (fcZero | fcSubnormal)) {
+ auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC));
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ ExpBits, ZeroC));
+ Mask &= ~PartialCheck;
+ }
+ }
+
+ // Check for individual classes.
+ if (FPClassTest PartialCheck = Mask & fcZero) {
+ if (PartialCheck == fcPosZero)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, ZeroC));
+ else if (PartialCheck == fcZero)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
+ else // fcNegZero
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, SignBitC));
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcSubnormal) {
+ // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
+ // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
+ auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
+ auto OneC = buildSPIRVConstant(IntTy, 1);
+ auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
+ auto SubnormalRes = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
+ buildSPIRVConstant(IntTy, AllOneMantissa)));
+ if (PartialCheck == fcNegSubnormal)
+ SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
+ appendToRes(std::move(SubnormalRes));
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcInf) {
+ if (PartialCheck == fcPosInf)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, InfC));
+ else if (PartialCheck == fcInf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
+ else { // fcNegInf
+ APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
+ auto NegInfC = buildSPIRVConstant(IntTy, NegInf);
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, NegInfC));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNan) {
+ auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+ if (PartialCheck == fcNan) {
+ // isnan(V) ==> abs(V) u> int(inf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
+ } else if (PartialCheck == fcQNan) {
+ // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
+ InfWithQnanBitC));
+ } else { // fcSNan
+ // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
+ // abs(V) u< (unsigned(Inf) | quiet_bit)
+ auto IsNan = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
+ auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp(
+ CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC));
+ appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNormal) {
+ // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
+ // (max_exp-1))
+ APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
+ auto ExpMinusOne = assignSPIRVTy(
+ MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
+ APInt MaxExpMinusOne = ExpMask - ExpLSB;
+ auto NormalRes = assignSPIRVTy(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
+ buildSPIRVConstant(IntTy, MaxExpMinusOne)));
+ if (PartialCheck == fcNegNormal)
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
+ else if (PartialCheck == fcPosNormal) {
+ auto PosSign = assignSPIRVTy(MIRBuilder.buildXor(
+ DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask)));
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
+ }
+ appendToRes(std::move(NormalRes));
+ }
+
+ MIRBuilder.buildCopy(DstReg, Res);
+ MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
index 6335f21..eeefa42 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
@@ -30,6 +30,10 @@ public:
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const override;
SPIRVLegalizerInfo(const SPIRVSubtarget &ST);
+
+private:
+ bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 43bf6e9..60c4e2d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -59,6 +59,8 @@ public:
Intrinsic::ID IID) const override;
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;
+
+ bool allowVectorElementIndexingUsingGEP() const override { return false; }
};
} // namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index e30d723..fb0a47d 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9044,7 +9044,7 @@ static unsigned detectEvenOddMultiplyOperand(const SelectionDAG &DAG,
if (unsigned(ShuffleMask[Elt]) != 2 * Elt)
CanUseEven = false;
if (unsigned(ShuffleMask[Elt]) != 2 * Elt + 1)
- CanUseEven = true;
+ CanUseOdd = false;
}
Op = Op.getOperand(0);
if (CanUseEven)
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index a606209..089be5f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -49,6 +49,8 @@ def FeatureFP16 :
SubtargetFeature<"fp16", "HasFP16", "true",
"Enable FP16 instructions">;
+def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">;
+
def FeatureMultiMemory :
SubtargetFeature<"multimemory", "HasMultiMemory", "true",
"Enable multiple memories">;
@@ -71,7 +73,6 @@ def FeatureReferenceTypes :
SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
"Enable reference types">;
-def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">;
def FeatureRelaxedSIMD :
SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
"Enable relaxed-simd instructions">;
@@ -139,10 +140,10 @@ def : ProcessorModel<"lime1", NoSchedModel,
def : ProcessorModel<"bleeding-edge", NoSchedModel,
[FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
FeatureCallIndirectOverlong, FeatureExceptionHandling,
- FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
- FeatureMultivalue, FeatureMutableGlobals,
+ FeatureExtendedConst, FeatureFP16, FeatureGC,
+ FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals,
FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
- FeatureReferenceTypes, FeatureGC, FeatureSIMD128,
+ FeatureReferenceTypes, FeatureSIMD128,
FeatureSignExt, FeatureTailCall]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index cd434f7..3f80b2a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+ // We don't natively support v16i8 mul, but we do support v8i16 so split the
+ // inputs and extend them to v8i16. Only do this before legalization in case
+ // a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 2b632fd..13d048a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -50,6 +50,9 @@ def HasFP16 :
Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<(all_of FeatureFP16), "fp16">;
+def HasGC : Predicate<"Subtarget->hasGC()">,
+ AssemblerPredicate<(all_of FeatureGC), "gc">;
+
def HasMultiMemory :
Predicate<"Subtarget->hasMultiMemory()">,
AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">;
@@ -76,9 +79,6 @@ def HasReferenceTypes :
Predicate<"Subtarget->hasReferenceTypes()">,
AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
-def HasGC : Predicate<"Subtarget->hasGC()">,
- AssemblerPredicate<(all_of FeatureGC), "gc">;
-
def HasRelaxedSIMD :
Predicate<"Subtarget->hasRelaxedSIMD()">,
AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index d13862f..143298b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
}
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 28f6599..c3990d1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
for (Instruction &I : BB) {
if (I.getType()->isVoidTy())
continue;
+
+ if (isa<AllocaInst>(&I)) {
+ // If the alloca has any lifetime marker that is no longer dominated
+ // by the alloca, remove all lifetime markers. Lifetime markers must
+ // always work directly on the alloca, and this is no longer possible.
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) {
+ auto *UserI = cast<Instruction>(U);
+ return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI);
+ });
+ if (HasNonDominatedLifetimeMarker) {
+ for (User *U : make_early_inc_range(I.users())) {
+ auto *UserI = cast<Instruction>(U);
+ if (UserI->isLifetimeStartOrEnd())
+ UserI->eraseFromParent();
+ }
+ }
+ }
+
unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
// If a value is defined by an invoke instruction, it is only available in
// its normal destination and not in its unwind destination.
@@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Setjmp preparation
+ SmallVector<AllocaInst *> StaticAllocas;
+ for (Instruction &I : F.getEntryBlock())
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ if (AI->isStaticAlloca())
+ StaticAllocas.push_back(AI);
+
BasicBlock *Entry = &F.getEntryBlock();
DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
SplitBlock(Entry, &*Entry->getFirstInsertionPt());
+ // Move static allocas back into the entry block, so they stay static.
+ for (AllocaInst *AI : StaticAllocas)
+ AI->moveBefore(Entry->getTerminator()->getIterator());
+
IRB.SetInsertPoint(Entry->getTerminator()->getIterator());
// This alloca'ed pointer is used by the runtime to identify function
// invocations. It's just for pointer comparisons. It will never be
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index f814274..2f88bbb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -46,12 +46,12 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasExceptionHandling = false;
bool HasExtendedConst = false;
bool HasFP16 = false;
+ bool HasGC = false;
bool HasMultiMemory = false;
bool HasMultivalue = false;
bool HasMutableGlobals = false;
bool HasNontrappingFPToInt = false;
bool HasReferenceTypes = false;
- bool HasGC = false;
bool HasSignExt = false;
bool HasTailCall = false;
bool HasWideArithmetic = false;
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 1bf9f8b..f9bd233 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources}
IRPrinter
Instrumentation
MC
+ ObjCARC
ProfileData
Scalar
SelectionDAG
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 0ff7f23..067bd43 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3673,6 +3673,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CLI.NumResultRegs = RVLocs.size();
CLI.Call = MIB;
+ // Add call site info for call graph section.
+ if (TM.Options.EmitCallGraphSection && CB && CB->isIndirectCall()) {
+ MachineFunction::CallSiteInfo CSInfo(*CB);
+ MF->addCallSiteInfo(CLI.Call, std::move(CSInfo));
+ }
+
return true;
}
@@ -4042,6 +4048,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
MO.setReg(IndexReg);
}
+ if (MI->isCall())
+ FuncInfo.MF->moveAdditionalCallInfo(MI, Result);
Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
MachineBasicBlock::iterator I(MI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 11ab8dc..bbbb1d9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58071,14 +58071,24 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
Ops[3] = Op1.getOperand(0);
Ops[4] = Op1.getOperand(1);
} else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
+ SDValue Src = Op1;
+ SDValue Op10 = Op1.getOperand(0);
+ if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
+ // res, flags2 = sub 0, (and (xor X, -1), Y)
+ // cload/cstore ..., cond_ne, flag2
+ // ->
+ // res, flags2 = sub 0, (and X, Y)
+ // cload/cstore ..., cond_e, flag2
+ Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
+ Op1.getOperand(1));
+ Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
+ }
// res, flags2 = sub 0, (and X, Y)
- // cload/cstore ..., cond_ne, flag2
+ // cload/cstore ..., cc, flag2
// ->
- // res, flags2 = and X, Y
- // cload/cstore ..., cond_ne, flag2
- Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0),
- Op1.getOperand(1))
- .getValue(1);
+ // res, flags2 = cmp (and X, Y), 0
+ // cload/cstore ..., cc, flag2
+ Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
} else {
return SDValue();
}
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b4639ac..5862c7e 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2060,6 +2060,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
+
if (IsIndirectCall && !IsWin64 &&
M->getModuleFlag("import-call-optimization"))
errorUnsupported(DAG, dl,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 78bd5b4..7e09d30 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -587,8 +587,9 @@ StringRef sys::detail::getHostCPUNameForBPF() {
#endif
}
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
- defined(_M_X64)
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)) && \
+ !defined(_M_ARM64EC)
/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in
/// the specified arguments. If we can't run cpuid on the host, return true.
@@ -1853,8 +1854,9 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
} // namespace llvm
#endif
-#if defined(__i386__) || defined(_M_IX86) || \
- defined(__x86_64__) || defined(_M_X64)
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)) && \
+ !defined(_M_ARM64EC)
StringMap<bool> sys::getHostCPUFeatures() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
@@ -2147,7 +2149,8 @@ StringMap<bool> sys::getHostCPUFeatures() {
return Features;
}
-#elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64))
+#elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64) || \
+ defined(__arm64ec__) || defined(_M_ARM64EC))
StringMap<bool> sys::getHostCPUFeatures() {
StringMap<bool> Features;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index e5c896f..126be71 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -446,6 +446,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["tanh-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
+ Features["bf16-cvt-insts"] = true;
Features["fp8-conversion-insts"] = true;
Features["fp8e5m3-insts"] = true;
Features["permlane16-swap"] = true;
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index ee6651c..6acb0bc 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
case PC: return "pc";
case SCEI: return "scei";
case SUSE: return "suse";
+ case Meta:
+ return "meta";
}
llvm_unreachable("Invalid VendorType!");
@@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case OpenHOS: return "ohos";
case PAuthTest:
return "pauthtest";
+ case MTIA:
+ return "mtia";
case LLVM:
return "llvm";
case Mlibc:
@@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("suse", Triple::SUSE)
.Case("oe", Triple::OpenEmbedded)
.Case("intel", Triple::Intel)
+ .Case("meta", Triple::Meta)
.Default(Triple::UnknownVendor);
}
@@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("pauthtest", Triple::PAuthTest)
.StartsWith("llvm", Triple::LLVM)
.StartsWith("mlibc", Triple::Mlibc)
+ .StartsWith("mtia", Triple::MTIA)
.Default(Triple::UnknownEnvironment);
}
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7af5ba4..40a7f80 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -458,29 +458,19 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
// Check if this array of constants represents a cttz table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
- uint64_t Shift, uint64_t InputBits) {
- unsigned Length = Table.getNumElements();
- if (Length < InputBits || Length > InputBits * 2)
- return false;
-
- APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
- unsigned Matched = 0;
-
- for (unsigned i = 0; i < Length; i++) {
- uint64_t Element = Table.getElementAsInteger(i);
- if (Element >= InputBits)
- continue;
-
- // Check if \p Element matches a concrete answer. It could fail for some
- // elements that are never accessed, so we keep iterating over each element
- // from the table. The number of matched elements should be equal to the
- // number of potential right answers which is \p InputBits actually.
- if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
- Matched++;
+static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
+ const APInt &AndMask, Type *AccessTy,
+ unsigned InputBits, const APInt &GEPIdxFactor,
+ const DataLayout &DL) {
+ for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+ APInt Index = (APInt(InputBits, 1).shl(Idx) * Mul).lshr(Shift) & AndMask;
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+ ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+ if (!C || C->getValue() != Idx)
+ return false;
}
- return Matched == InputBits;
+ return true;
}
// Try to recognize table-based ctz implementation.
@@ -495,6 +485,11 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
// this can be lowered to `cttz` instruction.
// There is also a special case when the element is 0.
//
+// The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
+// sequence that contains each pattern of bits in it. The shift extracts
+// the top bits after the multiply, and that index into the table should
+// represent the number of trailing zeros in the original number.
+//
// Here are some examples or LLVM IR for a 64-bit target:
//
// CASE 1:
@@ -536,8 +531,8 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
// i64 %shr
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
-// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
-static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
+static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (!LI)
return false;
@@ -547,53 +542,47 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
return false;
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
- if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
- return false;
-
- if (!GEP->getSourceElementType()->isArrayTy())
- return false;
-
- uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
- if (ArraySize != 32 && ArraySize != 64)
+ if (!GEP || !GEP->hasNoUnsignedSignedWrap())
return false;
GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
return false;
- ConstantDataArray *ConstData =
- dyn_cast<ConstantDataArray>(GVTable->getInitializer());
- if (!ConstData)
- return false;
-
- if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt ModOffset(BW, 0);
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+ VarOffsets.size() != 1 || ModOffset != 0)
return false;
+ auto [GepIdx, GEPScale] = VarOffsets.front();
- Value *Idx2 = std::next(GEP->idx_begin())->get();
Value *X1;
- uint64_t MulConst, ShiftConst;
- // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
- // probably fail for other (e.g. 32-bit) targets.
- if (!match(Idx2, m_ZExtOrSelf(
- m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
- m_ConstantInt(MulConst)),
- m_ConstantInt(ShiftConst)))))
+ const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+ // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
+ // This might be extended to the pointer index type, and if the gep index type
+ // has been replaced with an i8 then a new And (and different ShiftConst) will
+ // be present.
+ auto MatchInner = m_LShr(
+ m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
+ m_APInt(ShiftConst));
+ if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
+ !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
return false;
unsigned InputBits = X1->getType()->getScalarSizeInBits();
- if (InputBits != 32 && InputBits != 64)
- return false;
-
- // Shift should extract top 5..7 bits.
- if (InputBits - Log2_32(InputBits) != ShiftConst &&
- InputBits - Log2_32(InputBits) - 1 != ShiftConst)
+ if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
return false;
- if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+ if (!GEPScale.isIntN(InputBits) ||
+ !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
+ AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+ InputBits, GEPScale.zextOrTrunc(InputBits), DL))
return false;
- auto ZeroTableElem = ConstData->getElementAsInteger(0);
- bool DefinedForZero = ZeroTableElem == InputBits;
+ ConstantInt *ZeroTableElem = cast<ConstantInt>(
+ ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+ bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
IRBuilder<> B(LI);
ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
@@ -607,8 +596,7 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
// If the value in elem 0 isn't the same as InputBits, we still want to
// produce the value from the table.
auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
- auto Select =
- B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);
+ auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
// NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
// it should be handled as: `cttz(x) & (typeSize - 1)`.
@@ -1477,7 +1465,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= foldGuardedFunnelShift(I, DT);
MadeChange |= tryToRecognizePopCount(I);
MadeChange |= tryToFPToSat(I, TTI);
- MadeChange |= tryToRecognizeTableBasedCttz(I);
+ MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 64b33e4..ab906f9 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1568,7 +1568,7 @@ private:
if (DebugLoc SuspendLoc = S->getDebugLoc()) {
std::string LabelName =
("__coro_resume_" + Twine(SuspendIndex)).str();
- DILocation &DILoc = *SuspendLoc.get();
+ DILocation &DILoc = *SuspendLoc;
DILabel *ResumeLabel =
DBuilder.createLabel(DIS, LabelName, DILoc.getFile(),
SuspendLoc.getLine(), SuspendLoc.getCol(),
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index b3910c4..d895cd7 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -37,6 +37,16 @@
// memory that ends up in one of the runtime equivalents, since this can
// happen if e.g. a library that was compiled without interposition returns
// an allocation that can be validly passed to `free`.
+//
+// 3. MathFixup (required): Some accelerators might have an incomplete
+// implementation for the intrinsics used to implement some of the math
+// functions in <cmath> / their corresponding libcall lowerings. Since this
+// can vary quite significantly between accelerators, we replace calls to a
+// set of intrinsics / lib functions known to be problematic with calls to a
+// HIPSTDPAR specific forwarding layer, which gives an uniform interface for
+// accelerators to implement in their own runtime components. This pass
+// should run before AcceleratorCodeSelection so as to prevent the spurious
+// removal of the HIPSTDPAR specific forwarding functions.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
@@ -49,6 +59,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
return PreservedAnalyses::none();
}
+
+static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{
+ {"acosh", "__hipstdpar_acosh_f64"},
+ {"acoshf", "__hipstdpar_acosh_f32"},
+ {"asinh", "__hipstdpar_asinh_f64"},
+ {"asinhf", "__hipstdpar_asinh_f32"},
+ {"atanh", "__hipstdpar_atanh_f64"},
+ {"atanhf", "__hipstdpar_atanh_f32"},
+ {"cbrt", "__hipstdpar_cbrt_f64"},
+ {"cbrtf", "__hipstdpar_cbrt_f32"},
+ {"erf", "__hipstdpar_erf_f64"},
+ {"erff", "__hipstdpar_erf_f32"},
+ {"erfc", "__hipstdpar_erfc_f64"},
+ {"erfcf", "__hipstdpar_erfc_f32"},
+ {"fdim", "__hipstdpar_fdim_f64"},
+ {"fdimf", "__hipstdpar_fdim_f32"},
+ {"expm1", "__hipstdpar_expm1_f64"},
+ {"expm1f", "__hipstdpar_expm1_f32"},
+ {"hypot", "__hipstdpar_hypot_f64"},
+ {"hypotf", "__hipstdpar_hypot_f32"},
+ {"ilogb", "__hipstdpar_ilogb_f64"},
+ {"ilogbf", "__hipstdpar_ilogb_f32"},
+ {"lgamma", "__hipstdpar_lgamma_f64"},
+ {"lgammaf", "__hipstdpar_lgamma_f32"},
+ {"log1p", "__hipstdpar_log1p_f64"},
+ {"log1pf", "__hipstdpar_log1p_f32"},
+ {"logb", "__hipstdpar_logb_f64"},
+ {"logbf", "__hipstdpar_logb_f32"},
+ {"nextafter", "__hipstdpar_nextafter_f64"},
+ {"nextafterf", "__hipstdpar_nextafter_f32"},
+ {"nexttoward", "__hipstdpar_nexttoward_f64"},
+ {"nexttowardf", "__hipstdpar_nexttoward_f32"},
+ {"remainder", "__hipstdpar_remainder_f64"},
+ {"remainderf", "__hipstdpar_remainder_f32"},
+ {"remquo", "__hipstdpar_remquo_f64"},
+ {"remquof", "__hipstdpar_remquo_f32"},
+ {"scalbln", "__hipstdpar_scalbln_f64"},
+ {"scalblnf", "__hipstdpar_scalbln_f32"},
+ {"scalbn", "__hipstdpar_scalbn_f64"},
+ {"scalbnf", "__hipstdpar_scalbn_f32"},
+ {"tgamma", "__hipstdpar_tgamma_f64"},
+ {"tgammaf", "__hipstdpar_tgamma_f32"}};
+
+PreservedAnalyses HipStdParMathFixupPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ SmallVector<std::pair<Function *, std::string>> ToReplace;
+ for (auto &&F : M) {
+ if (!F.hasName())
+ continue;
+
+ StringRef N = F.getName();
+ Intrinsic::ID ID = F.getIntrinsicID();
+
+ switch (ID) {
+ case Intrinsic::not_intrinsic: {
+ auto It =
+ find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; });
+ if (It == std::cend(MathLibToHipStdPar))
+ continue;
+ ToReplace.emplace_back(&F, It->second);
+ break;
+ }
+ case Intrinsic::acos:
+ case Intrinsic::asin:
+ case Intrinsic::atan:
+ case Intrinsic::atan2:
+ case Intrinsic::cosh:
+ case Intrinsic::modf:
+ case Intrinsic::sinh:
+ case Intrinsic::tan:
+ case Intrinsic::tanh:
+ break;
+ default: {
+ if (F.getReturnType()->isDoubleTy()) {
+ switch (ID) {
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+ continue;
+ }
+ }
+
+ ToReplace.emplace_back(&F, N);
+ llvm::replace(ToReplace.back().second, '.', '_');
+ StringRef Prefix = "llvm";
+ ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar");
+ }
+ for (auto &&[F, NewF] : ToReplace)
+ F->replaceAllUsesWith(
+ M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee());
+
+ return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index f43202e..8262c8c 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes,
struct SCCNodesResult {
SCCNodeSet SCCNodes;
- bool HasUnknownCall;
};
} // end anonymous namespace
@@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes,
static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
SCCNodesResult Res;
- Res.HasUnknownCall = false;
for (Function *F : Functions) {
if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) ||
F->isPresplitCoroutine()) {
- // Treat any function we're trying not to optimize as if it were an
- // indirect call and omit it from the node set used below.
- Res.HasUnknownCall = true;
+ // Omit any functions we're trying not to optimize from the set.
continue;
}
- // Track whether any functions in this SCC have an unknown call edge.
- // Note: if this is ever a performance hit, we can common it with
- // subsequent routines which also do scans over the instructions of the
- // function.
- if (!Res.HasUnknownCall) {
- for (Instruction &I : instructions(*F)) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!CB->getCalledFunction()) {
- Res.HasUnknownCall = true;
- break;
- }
- }
- }
- }
+
Res.SCCNodes.insert(F);
}
return Res;
@@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
addColdAttrs(Nodes.SCCNodes, Changed);
addWillReturn(Nodes.SCCNodes, Changed);
addNoUndefAttrs(Nodes.SCCNodes, Changed);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
- if (!Nodes.HasUnknownCall) {
- addNoAliasAttrs(Nodes.SCCNodes, Changed);
- addNonNullAttrs(Nodes.SCCNodes, Changed);
- inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
- addNoRecurseAttrs(Nodes.SCCNodes, Changed);
- }
+ addNoAliasAttrs(Nodes.SCCNodes, Changed);
+ addNonNullAttrs(Nodes.SCCNodes, Changed);
+ inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
+ addNoRecurseAttrs(Nodes.SCCNodes, Changed);
// Finally, infer the maximal set of attributes from the ones we've inferred
// above. This is handling the cases where one attribute on a signature
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 486205c..57844a1 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -502,8 +502,7 @@ class LowerTypeTestsModule {
uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
TypeIdLowering importTypeId(StringRef TypeId);
void importTypeTest(CallInst *CI);
- void importFunction(Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase);
+ void importFunction(Function *F, bool isJumpTableCanonical);
BitSetInfo
buildBitSet(Metadata *TypeId,
@@ -1103,9 +1102,8 @@ void LowerTypeTestsModule::maybeReplaceComdat(Function *F,
// ThinLTO backend: the function F has a jump table entry; update this module
// accordingly. isJumpTableCanonical describes the type of the jump table entry.
-void LowerTypeTestsModule::importFunction(
- Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase) {
+void LowerTypeTestsModule::importFunction(Function *F,
+ bool isJumpTableCanonical) {
assert(F->getType()->getAddressSpace() == 0);
GlobalValue::VisibilityTypes Visibility = F->getVisibility();
@@ -1135,23 +1133,23 @@ void LowerTypeTestsModule::importFunction(
} else {
F->setName(Name + ".cfi");
maybeReplaceComdat(F, Name);
- F->setLinkage(GlobalValue::ExternalLinkage);
FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
F->getAddressSpace(), Name, &M);
FDecl->setVisibility(Visibility);
Visibility = GlobalValue::HiddenVisibility;
- // Delete aliases pointing to this function, they'll be re-created in the
- // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed
- // will want to reset the aliasees first.
+ // Update aliases pointing to this function to also include the ".cfi" suffix,
+ // We expect the jump table entry to either point to the real function or an
+ // alias. Redirect all other users to the jump table entry.
for (auto &U : F->uses()) {
if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
+ std::string AliasName = A->getName().str() + ".cfi";
Function *AliasDecl = Function::Create(
F->getFunctionType(), GlobalValue::ExternalLinkage,
F->getAddressSpace(), "", &M);
AliasDecl->takeName(A);
A->replaceAllUsesWith(AliasDecl);
- AliasesToErase.push_back(A);
+ A->setName(AliasName);
}
}
}
@@ -2077,16 +2075,13 @@ bool LowerTypeTestsModule::lower() {
Decls.push_back(&F);
}
- std::vector<GlobalAlias *> AliasesToErase;
{
ScopedSaveAliaseesAndUsed S(M);
for (auto *F : Defs)
- importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase);
+ importFunction(F, /*isJumpTableCanonical*/ true);
for (auto *F : Decls)
- importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase);
+ importFunction(F, /*isJumpTableCanonical*/ false);
}
- for (GlobalAlias *GA : AliasesToErase)
- GA->eraseFromParent();
return true;
}
@@ -2137,6 +2132,18 @@ bool LowerTypeTestsModule::lower() {
if (auto Alias = dyn_cast<AliasSummary>(RefGVS.get()))
AddressTaken.insert(Alias->getAliaseeGUID());
}
+ auto IsAddressTaken = [&](GlobalValue::GUID GUID) {
+ if (AddressTaken.count(GUID))
+ return true;
+ auto VI = ExportSummary->getValueInfo(GUID);
+ if (!VI)
+ return false;
+ for (auto &I : VI.getSummaryList())
+ if (auto Alias = dyn_cast<AliasSummary>(I.get()))
+ if (AddressTaken.count(Alias->getAliaseeGUID()))
+ return true;
+ return false;
+ };
for (auto *FuncMD : CfiFunctionsMD->operands()) {
assert(FuncMD->getNumOperands() >= 2);
StringRef FunctionName =
@@ -2153,7 +2160,7 @@ bool LowerTypeTestsModule::lower() {
// have no live references (and are not exported with cross-DSO CFI.)
if (!ExportSummary->isGUIDLive(GUID))
continue;
- if (!AddressTaken.count(GUID)) {
+ if (!IsAddressTaken(GUID)) {
if (!CrossDsoCfi || Linkage != CFL_Definition)
continue;
@@ -2227,6 +2234,43 @@ bool LowerTypeTestsModule::lower() {
}
}
+ struct AliasToCreate {
+ Function *Alias;
+ std::string TargetName;
+ };
+ std::vector<AliasToCreate> AliasesToCreate;
+
+ // Parse alias data to replace stand-in function declarations for aliases
+ // with an alias to the intended target.
+ if (ExportSummary) {
+ if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ for (auto *AliasMD : AliasesMD->operands()) {
+ SmallVector<Function *> Aliases;
+ for (Metadata *MD : AliasMD->operands()) {
+ auto *MDS = dyn_cast<MDString>(MD);
+ if (!MDS)
+ continue;
+ StringRef AliasName = MDS->getString();
+ if (!ExportedFunctions.count(AliasName))
+ continue;
+ auto *AliasF = M.getFunction(AliasName);
+ if (AliasF)
+ Aliases.push_back(AliasF);
+ }
+
+ if (Aliases.empty())
+ continue;
+
+ for (unsigned I = 1; I != Aliases.size(); ++I) {
+ auto *AliasF = Aliases[I];
+ ExportedFunctions.erase(AliasF->getName());
+ AliasesToCreate.push_back(
+ {AliasF, std::string(Aliases[0]->getName())});
+ }
+ }
+ }
+ }
+
DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
for (GlobalObject &GO : M.global_objects()) {
if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
@@ -2414,47 +2458,16 @@ bool LowerTypeTestsModule::lower() {
allocateByteArrays();
- // Parse alias data to replace stand-in function declarations for aliases
- // with an alias to the intended target.
- if (ExportSummary) {
- if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
- for (auto *AliasMD : AliasesMD->operands()) {
- assert(AliasMD->getNumOperands() >= 4);
- StringRef AliasName =
- cast<MDString>(AliasMD->getOperand(0))->getString();
- StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
-
- if (auto It = ExportedFunctions.find(Aliasee);
- It == ExportedFunctions.end() ||
- It->second.Linkage != CFL_Definition || !M.getNamedAlias(Aliasee))
- continue;
-
- GlobalValue::VisibilityTypes Visibility =
- static_cast<GlobalValue::VisibilityTypes>(
- cast<ConstantAsMetadata>(AliasMD->getOperand(2))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
- bool Weak =
- static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
-
- auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
- Alias->setVisibility(Visibility);
- if (Weak)
- Alias->setLinkage(GlobalValue::WeakAnyLinkage);
-
- if (auto *F = M.getFunction(AliasName)) {
- Alias->takeName(F);
- F->replaceAllUsesWith(Alias);
- F->eraseFromParent();
- } else {
- Alias->setName(AliasName);
- }
- }
- }
+ for (auto A : AliasesToCreate) {
+ auto *Target = M.getNamedValue(A.TargetName);
+ if (!isa<GlobalAlias>(Target))
+ continue;
+ auto *AliasGA = GlobalAlias::create("", Target);
+ AliasGA->setVisibility(A.Alias->getVisibility());
+ AliasGA->setLinkage(A.Alias->getLinkage());
+ AliasGA->takeName(A.Alias);
+ A.Alias->replaceAllUsesWith(AliasGA);
+ A.Alias->eraseFromParent();
}
// Emit .symver directives for exported functions, if they exist.
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 0164fcd..c009c1e 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
+STATISTIC(MismatchedCloneAssignments,
+ "Number of callsites assigned to call multiple non-matching clones");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -730,7 +732,7 @@ private:
/// of the functions tracked calls to their new versions in the CallMap.
/// Assigns new clones to clone number CloneNo.
FuncInfo cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
@@ -897,7 +899,7 @@ private:
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
@@ -989,7 +991,7 @@ private:
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
- std::map<CallInfo, CallInfo> &CallMap,
+ DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
@@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
+// Return the clone number of the given function by extracting it from the
+// memprof suffix. Assumes the caller has already confirmed it is a memprof
+// clone.
+static unsigned getMemProfCloneNum(const Function &F) {
+ assert(isMemProfClone(F));
+ auto Pos = F.getName().find_last_of('.');
+ assert(Pos > 0);
+ unsigned CloneNo;
+ bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
+ assert(!Err);
+ (void)Err;
+ return CloneNo;
+}
+
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- if (CalleeFunc.cloneNo() > 0)
+ auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ if (isMemProfClone(*CurF)) {
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
+ if (CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ }
+ if (NewCalleeCloneNo > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
@@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
- CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ CurCalleeCloneNo = NewCalleeCloneNo;
}
// Update the debug information attached to NewFunc to use the clone Name. Note
@@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Use existing LLVM facilities for cloning and obtaining Call in clone
ValueToValueMapTy VMap;
@@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite(
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
IndexCallsiteContextGraph::cloneFunctionForCallsite(
- FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Check how many clones we have of Call (and therefore function).
// The next clone number is the current size of versions array.
@@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
};
+ // Information for a single clone of this Func.
+ struct FuncCloneInfo {
+ // The function clone.
+ FuncInfo FuncClone;
+ // Remappings of each call of interest (from original uncloned call to the
+ // corresponding cloned call in this function clone).
+ DenseMap<CallInfo, CallInfo> CallMap;
+ };
+
// Walk all functions for which we saw calls with memprof metadata, and handle
// cloning for each of its calls.
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
FuncInfo OrigFunc(Func);
- // Map from each clone of OrigFunc to a map of remappings of each call of
- // interest (from original uncloned call to the corresponding cloned call in
- // that function clone).
- std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+ // Map from each clone number of OrigFunc to information about that function
+ // clone (the function clone FuncInfo and call remappings). The index into
+ // the vector is the clone number, as function clones are created and
+ // numbered sequentially.
+ std::vector<FuncCloneInfo> FuncCloneInfos;
for (auto &Call : CallsWithMetadata) {
ContextNode *Node = getNodeForInst(Call);
// Skip call if we do not have a node for it (all uses of its stack ids
@@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// Record the clone of callsite node assigned to this function clone.
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
- assert(FuncClonesToCallMap.count(FuncClone));
- std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+ assert(FuncCloneInfos.size() > FuncClone.cloneNo());
+ DenseMap<CallInfo, CallInfo> &CallMap =
+ FuncCloneInfos[FuncClone.cloneNo()].CallMap;
CallInfo CallClone(Call);
if (auto It = CallMap.find(Call); It != CallMap.end())
CallClone = It->second;
@@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// than existing function clones, which would have been assigned to an
// earlier clone in the list (we assign callsite clones to function
// clones greedily).
- if (FuncClonesToCallMap.size() < NodeCloneCount) {
+ if (FuncCloneInfos.size() < NodeCloneCount) {
// If this is the first callsite copy, assign to original function.
if (NodeCloneCount == 1) {
- // Since FuncClonesToCallMap is empty in this case, no clones have
+ // Since FuncCloneInfos is empty in this case, no clones have
// been created for this function yet, and no callers should have
// been assigned a function clone for this callee node yet.
assert(llvm::none_of(
@@ -4540,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}));
// Initialize with empty call map, assign Clone to original function
// and its callers, and skip to the next clone.
- FuncClonesToCallMap[OrigFunc] = {};
+ FuncCloneInfos.push_back(
+ {OrigFunc, DenseMap<CallInfo, CallInfo>()});
AssignCallsiteCloneToFuncClone(
OrigFunc, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
@@ -4572,14 +4627,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
// Clone function and save it along with the CallInfo map created
- // during cloning in the FuncClonesToCallMap.
- std::map<CallInfo, CallInfo> NewCallMap;
- unsigned CloneNo = FuncClonesToCallMap.size();
+ // during cloning in the FuncCloneInfos.
+ DenseMap<CallInfo, CallInfo> NewCallMap;
+ unsigned CloneNo = FuncCloneInfos.size();
assert(CloneNo > 0 && "Clone 0 is the original function, which "
"should already exist in the map");
FuncInfo NewFuncClone = cloneFunctionForCallsite(
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
- FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+ FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
FunctionClonesAnalysis++;
Changed = true;
@@ -4680,8 +4735,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// CallMap is set up as indexed by original Call at clone 0.
CallInfo OrigCall(Callee->getOrigNode()->Call);
OrigCall.setCloneNo(0);
- std::map<CallInfo, CallInfo> &CallMap =
- FuncClonesToCallMap[NewFuncClone];
+ DenseMap<CallInfo, CallInfo> &CallMap =
+ FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
assert(CallMap.count(OrigCall));
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
@@ -4703,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}
+ auto FindFirstAvailFuncClone = [&]() {
+ // Find first function in FuncCloneInfos without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncCloneInfos size was smaller than the clone number.
+ for (auto &CF : FuncCloneInfos) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
+ return CF.FuncClone;
+ }
+ llvm_unreachable(
+ "Expected an available func clone for this callsite clone");
+ };
+
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
@@ -4819,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
- // Find first function in FuncClonesToCallMap without an assigned
- // clone of this callsite Node. We should always have one
- // available at this point due to the earlier cloning when the
- // FuncClonesToCallMap size was smaller than the clone number.
- for (auto &CF : FuncClonesToCallMap) {
- if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
- FuncCloneAssignedToCurCallsiteClone = CF.first;
- break;
- }
- }
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
@@ -4842,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
+ // If we didn't assign a function clone to this callsite clone yet, e.g.
+ // none of its callers has a non-null call, do the assignment here.
+ // We want to ensure that every callsite clone is assigned to some
+ // function clone, so that the call updates below work as expected.
+ // In particular if this is the original callsite, we want to ensure it
+ // is assigned to the original function, otherwise the original function
+ // will appear available for assignment to other callsite clones,
+ // leading to unintended effects. For one, the unknown and not updated
+ // callers will call into cloned paths leading to the wrong hints,
+ // because they still call the original function (clone 0). Also,
+ // because all callsites start out as being clone 0 by default, we can't
+ // easily distinguish between callsites explicitly assigned to clone 0
+ // vs those never assigned, which can lead to multiple updates of the
+ // calls when invoking updateCall below, with mismatched clone values.
+ // TODO: Add a flag to the callsite nodes or some other mechanism to
+ // better distinguish and identify callsite clones that are not getting
+ // assigned to function clones as expected.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
+ assert(FuncCloneAssignedToCurCallsiteClone &&
+ "No available func clone for this callsite clone");
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+ /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
+ }
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index e276376..4387c38 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -384,6 +384,10 @@ void splitAndWriteThinLTOBitcode(
for (auto &F : M)
if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
CfiFunctions.insert(&F);
+ for (auto &A : M.aliases())
+ if (auto *F = dyn_cast<Function>(A.getAliasee()))
+ if (HasTypeMetadata(F))
+ CfiFunctions.insert(&A);
// Remove all globals with type metadata, globals with comdats that live in
// MergedM, and aliases pointing to such globals from the thin LTO module.
@@ -403,12 +407,12 @@ void splitAndWriteThinLTOBitcode(
auto &Ctx = MergedM->getContext();
SmallVector<MDNode *, 8> CfiFunctionMDs;
for (auto *V : CfiFunctions) {
- Function &F = *cast<Function>(V);
+ Function &F = *cast<Function>(V->getAliaseeObject());
SmallVector<MDNode *, 2> Types;
F.getMetadata(LLVMContext::MD_type, Types);
SmallVector<Metadata *, 4> Elts;
- Elts.push_back(MDString::get(Ctx, F.getName()));
+ Elts.push_back(MDString::get(Ctx, V->getName()));
CfiFunctionLinkage Linkage;
if (lowertypetests::isJumpTableCanonical(&F))
Linkage = CFL_Definition;
@@ -428,29 +432,24 @@ void splitAndWriteThinLTOBitcode(
NMD->addOperand(MD);
}
- SmallVector<MDNode *, 8> FunctionAliases;
+ MapVector<Function *, std::vector<GlobalAlias *>> FunctionAliases;
for (auto &A : M.aliases()) {
if (!isa<Function>(A.getAliasee()))
continue;
auto *F = cast<Function>(A.getAliasee());
-
- Metadata *Elts[] = {
- MDString::get(Ctx, A.getName()),
- MDString::get(Ctx, F->getName()),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
- };
-
- FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
+ FunctionAliases[F].push_back(&A);
}
if (!FunctionAliases.empty()) {
NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
- for (auto *MD : FunctionAliases)
- NMD->addOperand(MD);
+ for (auto &Alias : FunctionAliases) {
+ SmallVector<Metadata *> Elts;
+ Elts.push_back(MDString::get(Ctx, Alias.first->getName()));
+ for (auto *A : Alias.second)
+ Elts.push_back(MDString::get(Ctx, A->getName()));
+ NMD->addOperand(MDTuple::get(Ctx, Elts));
+ }
}
SmallVector<MDNode *, 8> Symvers;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b231c04..d7971e8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -11,10 +11,13 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/FloatingPointPredicateUtils.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -3589,6 +3592,154 @@ static Value *foldOrOfInversions(BinaryOperator &I,
return nullptr;
}
+/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl"
+/// patterns, which extract vector elements and pack them in the same relative
+/// positions.
+///
+/// \p Vec is the underlying vector being extracted from.
+/// \p Mask is a bitmask identifying which packed elements are obtained from the
+/// vector.
+/// \p VecOffset is the vector element corresponding to index 0 of the
+/// mask.
+static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec,
+ int64_t &VecOffset,
+ SmallBitVector &Mask,
+ const DataLayout &DL) {
+ static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) {
+ ShlAmt = 0;
+ return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base);
+ };
+
+ // First try to match extractelement -> zext -> shl
+ uint64_t VecIdx, ShlAmt;
+ if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt(
+ m_Value(Vec), m_ConstantInt(VecIdx))),
+ ShlAmt))) {
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+ auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType());
+ if (!EltTy)
+ return false;
+
+ const unsigned EltBitWidth = EltTy->getBitWidth();
+ const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth();
+ if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0)
+ return false;
+ const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth;
+ const unsigned ShlEltAmt = ShlAmt / EltBitWidth;
+
+ const unsigned MaskIdx =
+ DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1;
+
+ VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx);
+ Mask.resize(TargetEltWidth);
+ Mask.set(MaskIdx);
+ return true;
+ }
+
+ // Now try to match a bitcasted subvector.
+ Instruction *SrcVecI;
+ if (!match(V, m_BitCast(m_Instruction(SrcVecI))))
+ return false;
+
+ auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType());
+ if (!SrcTy)
+ return false;
+
+ Mask.resize(SrcTy->getNumElements());
+
+ // First check for a subvector obtained from a shufflevector.
+ if (isa<ShuffleVectorInst>(SrcVecI)) {
+ Constant *ConstVec;
+ ArrayRef<int> ShuffleMask;
+ if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec),
+ m_Mask(ShuffleMask))))
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+
+ const unsigned NumVecElts = VecTy->getNumElements();
+ bool FoundVecOffset = false;
+ for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) {
+ if (ShuffleMask[Idx] == PoisonMaskElem)
+ return false;
+ const unsigned ShuffleIdx = ShuffleMask[Idx];
+ if (ShuffleIdx >= NumVecElts) {
+ const unsigned ConstIdx = ShuffleIdx - NumVecElts;
+ auto *ConstElt =
+ dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx));
+ if (!ConstElt || !ConstElt->isNullValue())
+ return false;
+ continue;
+ }
+
+ if (FoundVecOffset) {
+ if (VecOffset + Idx != ShuffleIdx)
+ return false;
+ } else {
+ if (ShuffleIdx < Idx)
+ return false;
+ VecOffset = ShuffleIdx - Idx;
+ FoundVecOffset = true;
+ }
+ Mask.set(Idx);
+ }
+ return FoundVecOffset;
+ }
+
+ // Check for a subvector obtained as an (insertelement V, 0, idx)
+ uint64_t InsertIdx;
+ if (!match(SrcVecI,
+ m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx))))
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return false;
+ VecOffset = 0;
+ bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx);
+ Mask.set();
+ if (!AlreadyInsertedMaskedElt)
+ Mask.reset(InsertIdx);
+ return true;
+}
+
+/// Try to fold the join of two scalar integers whose contents are packed
+/// elements of the same vector.
+static Instruction *foldIntegerPackFromVector(Instruction &I,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *LhsVec, *RhsVec;
+ int64_t LhsVecOffset, RhsVecOffset;
+ SmallBitVector Mask;
+ if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset,
+ Mask, DL))
+ return nullptr;
+ if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset,
+ Mask, DL))
+ return nullptr;
+ if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset)
+ return nullptr;
+
+ // Convert into shufflevector -> bitcast;
+ const unsigned ZeroVecIdx =
+ cast<FixedVectorType>(LhsVec->getType())->getNumElements();
+ SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx);
+ for (unsigned Idx : Mask.set_bits()) {
+ assert(LhsVecOffset + Idx >= 0);
+ ShuffleMask[Idx] = LhsVecOffset + Idx;
+ }
+
+ Value *MaskedVec = Builder.CreateShuffleVector(
+ LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask,
+ I.getName() + ".v");
+ return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType());
+}
+
// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
// track these properities for preservation. Note that we can decompose
// equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
@@ -3766,6 +3917,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *X = foldComplexAndOrPatterns(I, Builder))
return X;
+ if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL))
+ return X;
+
// (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D
// (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C
if (Value *V = foldOrOfInversions(I, Builder))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 1b78ace..47e017e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3891,16 +3891,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
}
- // Try to fold intrinsic into select operands. This is legal if:
+ // Try to fold intrinsic into select/phi operands. This is legal if:
// * The intrinsic is speculatable.
// * The select condition is not a vector, or the intrinsic does not
// perform cross-lane operations.
if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) &&
isNotCrossLaneOperation(II))
- for (Value *Op : II->args())
+ for (Value *Op : II->args()) {
if (auto *Sel = dyn_cast<SelectInst>(Op))
if (Instruction *R = FoldOpIntoSelect(*II, Sel))
return R;
+ if (auto *Phi = dyn_cast<PHINode>(Op))
+ if (Instruction *R = foldOpIntoPhi(*II, Phi))
+ return R;
+ }
if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
return Shuf;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 033ef8b..a43a6ee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) &&
all_equal(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ ElementCount::isKnownGE(Shuf->getType()->getElementCount(),
+ cast<VectorType>(Shuf->getOperand(0)->getType())
+ ->getElementCount())) {
// trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask
// trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType(
+ Trunc.getType()->getScalarType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy);
return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask());
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index da9b126..b268fea 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -163,6 +163,11 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
LaterIndices.push_back(IdxVal);
}
+ Value *Idx = GEP->getOperand(2);
+ // If the index type is non-canonical, wait for it to be canonicalized.
+ if (Idx->getType() != DL.getIndexType(GEP->getType()))
+ return nullptr;
+
enum { Overdefined = -3, Undefined = -2 };
// Variables for our state machines.
@@ -290,17 +295,6 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Now that we've scanned the entire array, emit our new comparison(s). We
// order the state machines in complexity of the generated code.
- Value *Idx = GEP->getOperand(2);
-
- // If the index is larger than the pointer offset size of the target, truncate
- // the index down like the GEP would do implicitly. We don't have to do this
- // for an inbounds GEP because the index can't be out of range.
- if (!GEP->isInBounds()) {
- Type *PtrIdxTy = DL.getIndexType(GEP->getType());
- unsigned OffsetSize = PtrIdxTy->getIntegerBitWidth();
- if (Idx->getType()->getPrimitiveSizeInBits().getFixedValue() > OffsetSize)
- Idx = Builder.CreateTrunc(Idx, PtrIdxTy);
- }
// If inbounds keyword is not present, Idx * ElementSize can overflow.
// Let's assume that ElementSize is 2 and the wanted value is at offset 0.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e2a9255..56358b1 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1994,6 +1994,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
}
Clone = InsertNewInstBefore(Clone, OpBB->getTerminator()->getIterator());
Clones.insert({OpBB, Clone});
+ // We may have speculated the instruction.
+ Clone->dropUBImplyingAttrsAndMetadata();
}
NewPhiValues[OpIndex] = Clone;
@@ -2777,6 +2779,12 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
Indices.append(GEP.idx_begin()+1, GEP.idx_end());
}
+ // Don't create GEPs with more than one variable index.
+ unsigned NumVarIndices =
+ count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); });
+ if (NumVarIndices > 1)
+ return nullptr;
+
if (!Indices.empty())
return replaceInstUsesWith(
GEP, Builder.CreateGEP(
@@ -3176,7 +3184,16 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// If we are using a wider index than needed for this platform, shrink
// it to what we need. If narrower, sign-extend it to what we need.
// This explicit cast can make subsequent optimizations more obvious.
- *I = Builder.CreateIntCast(*I, NewIndexType, true);
+ if (IndexTy->getScalarSizeInBits() <
+ NewIndexType->getScalarSizeInBits()) {
+ if (GEP.hasNoUnsignedWrap() && GEP.hasNoUnsignedSignedWrap())
+ *I = Builder.CreateZExt(*I, NewIndexType, "", /*IsNonNeg=*/true);
+ else
+ *I = Builder.CreateSExt(*I, NewIndexType);
+ } else {
+ *I = Builder.CreateTrunc(*I, NewIndexType, "", GEP.hasNoUnsignedWrap(),
+ GEP.hasNoUnsignedSignedWrap());
+ }
MadeChange = true;
}
}
@@ -3199,6 +3216,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, NewGEP);
}
+ // Strip trailing zero indices.
+ auto *LastIdx = dyn_cast<Constant>(Indices.back());
+ if (LastIdx && LastIdx->isNullValue() && !LastIdx->getType()->isVectorTy()) {
+ return replaceInstUsesWith(
+ GEP, Builder.CreateGEP(GEP.getSourceElementType(), PtrOp,
+ drop_end(Indices), "", GEP.getNoWrapFlags()));
+ }
+
// Scalarize vector operands; prefer splat-of-gep.as canonical form.
// Note that this looses information about undef lanes; we run it after
// demanded bits to partially mitigate that loss.
@@ -3225,6 +3250,30 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, Res);
}
+ bool SeenVarIndex = false;
+ for (auto [IdxNum, Idx] : enumerate(Indices)) {
+ if (isa<Constant>(Idx))
+ continue;
+
+ if (!SeenVarIndex) {
+ SeenVarIndex = true;
+ continue;
+ }
+
+ // GEP has multiple variable indices: Split it.
+ ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
+ Value *FrontGEP =
+ Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
+ GEP.getName() + ".split", GEP.getNoWrapFlags());
+
+ SmallVector<Value *> BackIndices;
+ BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy));
+ append_range(BackIndices, drop_begin(Indices, IdxNum));
+ return GetElementPtrInst::Create(
+ GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP,
+ BackIndices, GEP.getNoWrapFlags());
+ }
+
// Check to see if the inputs to the PHI node are getelementptr instructions.
if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder))
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 15fd421..0a97ed4 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_component_library(LLVMInstrumentation
LowerAllowCheckPass.cpp
PGOCtxProfFlattening.cpp
PGOCtxProfLowering.cpp
+ PGOEstimateTripCounts.cpp
PGOForceFunctionAttrs.cpp
PGOInstrumentation.cpp
PGOMemOPSizeOpt.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index df31f07..54d9a83 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Approximately handle AVX Galois Field Affine Transformation
+ //
+ // e.g.,
+ // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+ // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+ // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+ // Out A x b
+ // where A and x are packed matrices, b is a vector,
+ // Out = A * x + b in GF(2)
+ //
+ // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
+ // computation also includes a parity calculation.
+ //
+ // For the bitwise AND of bits V1 and V2, the exact shadow is:
+ // Out_Shadow = (V1_Shadow & V2_Shadow)
+ // | (V1 & V2_Shadow)
+ // | (V1_Shadow & V2 )
+ //
+ // We approximate the shadow of gf2p8affineqb using:
+ // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
+ // | gf2p8affineqb(x, A_shadow, 0)
+ // | gf2p8affineqb(x_Shadow, A, 0)
+ // | set1_epi8(b_Shadow)
+ //
+ // This approximation has false negatives: if an intermediate dot-product
+ // contains an even number of 1's, the parity is 0.
+ // It has no false positives.
+ void handleAVXGF2P8Affine(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+
+ assert(I.arg_size() == 3);
+ Value *A = I.getOperand(0);
+ Value *X = I.getOperand(1);
+ Value *B = I.getOperand(2);
+
+ assert(isFixedIntVector(A));
+ assert(cast<VectorType>(A->getType())
+ ->getElementType()
+ ->getScalarSizeInBits() == 8);
+
+ assert(A->getType() == X->getType());
+
+ assert(B->getType()->isIntegerTy());
+ assert(B->getType()->getScalarSizeInBits() == 8);
+
+ assert(I.getType() == A->getType());
+
+ Value *AShadow = getShadow(A);
+ Value *XShadow = getShadow(X);
+ Value *BZeroShadow = getCleanShadow(B);
+
+ CallInst *AShadowXShadow = IRB.CreateIntrinsic(
+ I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow});
+ CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {X, AShadow, BZeroShadow});
+ CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {XShadow, A, BZeroShadow});
+
+ unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
+ Value *BShadow = getShadow(B);
+ Value *BBroadcastShadow = getCleanShadow(AShadow);
+ // There is no LLVM IR intrinsic for _mm512_set1_epi8.
+ // This loop generates a lot of LLVM IR, which we expect that CodeGen will
+ // lower appropriately (e.g., VPBROADCASTB).
+ // Besides, b is often a constant, in which case it is fully initialized.
+ for (unsigned i = 0; i < NumElements; i++)
+ BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i);
+
+ setShadow(&I, IRB.CreateOr(
+ {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow}));
+ setOriginForNaryOp(I);
+ }
+
// Handle Arm NEON vector load intrinsics (vld*).
//
// The WithLane instructions (ld[234]lane) are similar to:
@@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // AVX Galois Field New Instructions
+ case Intrinsic::x86_vgf2p8affineqb_128:
+ case Intrinsic::x86_vgf2p8affineqb_256:
+ case Intrinsic::x86_vgf2p8affineqb_512:
+ handleAVXGF2P8Affine(I);
+ break;
+
case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOEstimateTripCounts.cpp b/llvm/lib/Transforms/Instrumentation/PGOEstimateTripCounts.cpp
new file mode 100644
index 0000000..762aca0
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PGOEstimateTripCounts.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/PGOEstimateTripCounts.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-estimate-trip-counts"
+
+static bool runOnLoop(Loop *L) {
+ bool MadeChange = false;
+ std::optional<unsigned> TC = getLoopEstimatedTripCount(
+ L, /*EstimatedLoopInvocationWeight=*/nullptr, /*DbgForInit=*/true);
+ MadeChange |= setLoopEstimatedTripCount(L, TC);
+ for (Loop *SL : *L)
+ MadeChange |= runOnLoop(SL);
+ return MadeChange;
+}
+
+PreservedAnalyses PGOEstimateTripCountsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ bool MadeChange = false;
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": start\n");
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ LoopInfo *LI = &FAM.getResult<LoopAnalysis>(F);
+ if (!LI)
+ continue;
+ for (Loop *L : *LI)
+ MadeChange |= runOnLoop(L);
+ }
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": end\n");
+ return MadeChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index f6780c0..ce1d9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
if (DisableMemOPOPT)
return false;
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
MemOPSizeOpt.perform();
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 0f63ed0..9b87180 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1360,13 +1360,10 @@ struct DSEState {
/// indicating whether \p I is a free-like call.
std::optional<std::pair<MemoryLocation, bool>>
getLocForTerminator(Instruction *I) const {
- uint64_t Len;
- Value *Ptr;
- if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
- m_Value(Ptr))))
- return {std::make_pair(MemoryLocation(Ptr, Len), false)};
-
if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->getIntrinsicID() == Intrinsic::lifetime_end)
+ return {
+ std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)};
if (Value *FreedOp = getFreedOperand(CB, &TLI))
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index e706a6f..deff79b 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -237,8 +237,7 @@ class InductiveRangeCheckElimination {
DominatorTree &DT;
LoopInfo &LI;
- using GetBFIFunc =
- std::optional<llvm::function_ref<llvm::BlockFrequencyInfo &()>>;
+ using GetBFIFunc = llvm::function_ref<llvm::BlockFrequencyInfo &()>;
GetBFIFunc GetBFI;
// Returns the estimated number of iterations based on block frequency info if
@@ -249,7 +248,7 @@ class InductiveRangeCheckElimination {
public:
InductiveRangeCheckElimination(ScalarEvolution &SE,
BranchProbabilityInfo *BPI, DominatorTree &DT,
- LoopInfo &LI, GetBFIFunc GetBFI = std::nullopt)
+ LoopInfo &LI, GetBFIFunc GetBFI = nullptr)
: SE(SE), BPI(BPI), DT(DT), LI(LI), GetBFI(GetBFI) {}
bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
@@ -959,7 +958,7 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
std::optional<uint64_t>
InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) {
if (GetBFI) {
- BlockFrequencyInfo &BFI = (*GetBFI)();
+ BlockFrequencyInfo &BFI = GetBFI();
uint64_t hFreq = BFI.getBlockFreq(L.getHeader()).getFrequency();
uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency();
if (phFreq == 0 || hFreq == 0)
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 6a3f656..1a52af1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -651,7 +651,7 @@ class NewGVN {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred;
+ mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -840,7 +840,7 @@ private:
// Ranking
unsigned int getRank(const Value *) const;
bool shouldSwapOperands(const Value *, const Value *) const;
- bool shouldSwapOperandsForIntrinsic(const Value *, const Value *,
+ bool shouldSwapOperandsForPredicate(const Value *, const Value *,
const IntrinsicInst *I) const;
// Reachability handling.
@@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
Value *AdditionallyUsedValue = CmpOp0;
// Sort the ops.
- if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) {
+ if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) {
std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
@@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() {
PredicateToUsers.clear();
MemoryToUsers.clear();
RevisitOnReachabilityChange.clear();
- IntrinsicInstPred.clear();
+ PredicateSwapChoice.clear();
}
// Assign local DFS number mapping to instructions, and leave space for Value
@@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
}
-bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B,
+bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B,
const IntrinsicInst *I) const {
- auto LookupResult = IntrinsicInstPred.find(I);
if (shouldSwapOperands(A, B)) {
- if (LookupResult == IntrinsicInstPred.end())
- IntrinsicInstPred.insert({I, B});
- else
- LookupResult->second = B;
+ PredicateSwapChoice[I] = B;
return true;
}
- if (LookupResult != IntrinsicInstPred.end()) {
+ auto LookupResult = PredicateSwapChoice.find(I);
+ if (LookupResult != PredicateSwapChoice.end()) {
auto *SeenPredicate = LookupResult->second;
if (SeenPredicate) {
+ // We previously decided to swap B to the left. Keep that choice.
if (SeenPredicate == B)
return true;
else
diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9fe655e..fca09c6 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) {
static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
LibCallsShrinkWrap CCDCE(TLI, DTU);
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index e7623aa..9043baa 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -201,34 +201,40 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) {
}
/// Create MDNode for input string.
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name,
+ std::optional<unsigned> V) {
LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
+ if (V) {
+ Metadata *MDs[] = {MDString::get(Context, Name),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Context), *V))};
+ return MDNode::get(Context, MDs);
+ }
+ return MDNode::get(Context, {MDString::get(Context, Name)});
}
-/// Set input string into loop metadata by keeping other values intact.
-/// If the string is already in loop metadata update value if it is
-/// different.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
- unsigned V) {
+bool llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
+ std::optional<unsigned> V) {
SmallVector<Metadata *, 4> MDs(1);
// If the loop already has metadata, retain it.
MDNode *LoopID = TheLoop->getLoopID();
if (LoopID) {
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If it is of form key = value, try to parse it.
- if (Node->getNumOperands() == 2) {
+ // If it is of form key [= value], try to parse it.
+ unsigned NumOps = Node->getNumOperands();
+ if (NumOps == 1 || NumOps == 2) {
MDString *S = dyn_cast<MDString>(Node->getOperand(0));
if (S && S->getString() == StringMD) {
- ConstantInt *IntMD =
- mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
- if (IntMD && IntMD->getSExtValue() == V)
- // It is already in place. Do nothing.
- return;
+ // If the metadata and any value are already as specified, do nothing.
+ if (NumOps == 2 && V) {
+ ConstantInt *IntMD =
+ mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
+ if (IntMD && IntMD->getSExtValue() == *V)
+ return false;
+ } else if (NumOps == 1 && !V) {
+ return false;
+ }
// We need to update the value, so just skip it here and it will
// be added after copying other existed nodes.
continue;
@@ -245,6 +251,7 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
TheLoop->setLoopID(NewLoopID);
+ return true;
}
std::optional<ElementCount>
@@ -804,26 +811,48 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
return LatchBR;
}
-/// Return the estimated trip count for any exiting branch which dominates
-/// the loop latch.
-static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
- Loop *L,
- uint64_t &OrigExitWeight) {
+struct DbgLoop {
+ const Loop *L;
+ explicit DbgLoop(const Loop *L) : L(L) {}
+};
+static inline raw_ostream &operator<<(raw_ostream &OS, DbgLoop D) {
+ OS << "function ";
+ D.L->getHeader()->getParent()->printAsOperand(OS, /*PrintType=*/false);
+ return OS << " " << *D.L;
+}
+
+static std::optional<unsigned> estimateLoopTripCount(Loop *L) {
+ // Currently we take the estimate exit count only from the loop latch,
+ // ignoring other exiting blocks. This can overestimate the trip count
+ // if we exit through another exit, but can never underestimate it.
+ // TODO: incorporate information from other exits
+ BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L);
+ if (!ExitingBranch) {
+ LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed to find exiting "
+ << "latch branch of required form in " << DbgLoop(L)
+ << "\n");
+ return std::nullopt;
+ }
+
// To estimate the number of times the loop body was executed, we want to
// know the number of times the backedge was taken, vs. the number of times
// we exited the loop.
uint64_t LoopWeight, ExitWeight;
- if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight))
+ if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight)) {
+ LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed to extract branch "
+ << "weights for " << DbgLoop(L) << "\n");
return std::nullopt;
+ }
if (L->contains(ExitingBranch->getSuccessor(1)))
std::swap(LoopWeight, ExitWeight);
- if (!ExitWeight)
+ if (!ExitWeight) {
// Don't have a way to return predicated infinite
+ LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed because of zero exit "
+ << "probability for " << DbgLoop(L) << "\n");
return std::nullopt;
-
- OrigExitWeight = ExitWeight;
+ }
// Estimated exit count is a ratio of the loop weight by the weight of the
// edge exiting the loop, rounded to nearest.
@@ -834,33 +863,86 @@ static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
return std::numeric_limits<unsigned>::max();
// Estimated trip count is one plus estimated exit count.
- return ExitCount + 1;
+ uint64_t TC = ExitCount + 1;
+ LLVM_DEBUG(dbgs() << "estimateLoopTripCount: estimated trip count of " << TC
+ << " for " << DbgLoop(L) << "\n");
+ return TC;
}
-std::optional<unsigned>
-llvm::getLoopEstimatedTripCount(Loop *L,
- unsigned *EstimatedLoopInvocationWeight) {
- // Currently we take the estimate exit count only from the loop latch,
- // ignoring other exiting blocks. This can overestimate the trip count
- // if we exit through another exit, but can never underestimate it.
- // TODO: incorporate information from other exits
- if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) {
- uint64_t ExitWeight;
- if (std::optional<uint64_t> EstTripCount =
- getEstimatedTripCount(LatchBranch, L, ExitWeight)) {
- if (EstimatedLoopInvocationWeight)
- *EstimatedLoopInvocationWeight = ExitWeight;
- return *EstTripCount;
+std::optional<unsigned> llvm::getLoopEstimatedTripCount(
+ Loop *L, unsigned *EstimatedLoopInvocationWeight, bool DbgForInit) {
+ // If requested, either compute *EstimatedLoopInvocationWeight or return
+ // nullopt if cannot.
+ //
+ // TODO: Eventually, once all passes have migrated away from setting branch
+ // weights to indicate estimated trip counts, this function will drop the
+ // EstimatedLoopInvocationWeight parameter.
+ if (EstimatedLoopInvocationWeight) {
+ if (BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L)) {
+ uint64_t LoopWeight = 0, ExitWeight = 0; // Inits expected to be unused.
+ if (!extractBranchWeights(*ExitingBranch, LoopWeight, ExitWeight))
+ return std::nullopt;
+ if (L->contains(ExitingBranch->getSuccessor(1)))
+ std::swap(LoopWeight, ExitWeight);
+ if (!ExitWeight)
+ return std::nullopt;
+ *EstimatedLoopInvocationWeight = ExitWeight;
}
}
- return std::nullopt;
+
+ // Return the estimated trip count from metadata unless the metadata is
+ // missing or has no value.
+ bool Missing = false; // Initialization is expected to be unused.
+ if (auto TC = getOptionalIntLoopAttribute(L, LLVMLoopEstimatedTripCount,
+ &Missing)) {
+ LLVM_DEBUG(dbgs() << "getLoopEstimatedTripCount: "
+ << LLVMLoopEstimatedTripCount << " metadata has trip "
+ << "count of " << *TC << " for " << DbgLoop(L) << "\n");
+ return TC;
+ }
+
+ // Estimate the trip count from latch branch weights.
+ std::optional<unsigned> TC = estimateLoopTripCount(L);
+ if (DbgForInit) {
+ // We expect no existing metadata as we are responsible for creating it.
+ LLVM_DEBUG(dbgs() << (Missing ? "" : "WARNING: ")
+ << "getLoopEstimatedTripCount: "
+ << LLVMLoopEstimatedTripCount << " metadata "
+ << (Missing ? "" : "not ") << "missing as expected "
+ << "during its init for " << DbgLoop(L) << "\n");
+ } else if (Missing) {
+ // We expect that metadata was already created.
+ LLVM_DEBUG(dbgs() << "WARNING: getLoopEstimatedTripCount: "
+ << LLVMLoopEstimatedTripCount << " metadata missing for "
+ << DbgLoop(L) << "\n");
+ } else {
+ // If the trip count is estimable, the value should have been added already.
+ LLVM_DEBUG(dbgs() << (TC ? "WARNING: " : "")
+ << "getLoopEstimatedTripCount: "
+ << LLVMLoopEstimatedTripCount << " metadata "
+ << (TC ? "incorrectly " : "correctly ")
+ << "indicates trip count is inestimable for "
+ << DbgLoop(L) << "\n");
+ }
+ return TC;
}
-bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
- unsigned EstimatedloopInvocationWeight) {
- // At the moment, we currently support changing the estimate trip count of
- // the latch branch only. We could extend this API to manipulate estimated
- // trip counts for any exit.
+bool llvm::setLoopEstimatedTripCount(
+ Loop *L, std::optional<unsigned> EstimatedTripCount,
+ std::optional<unsigned> EstimatedloopInvocationWeight) {
+ // Set the metadata.
+ bool Updated = addStringMetadataToLoop(L, LLVMLoopEstimatedTripCount,
+ EstimatedTripCount);
+ if (!EstimatedTripCount || !EstimatedloopInvocationWeight)
+ return Updated;
+
+ // At the moment, we currently support changing the estimated trip count in
+ // the latch branch's branch weights only. We could extend this API to
+ // manipulate estimated trip counts for any exit.
+ //
+ // TODO: Eventually, once all passes have migrated away from setting branch
+ // weights to indicate estimated trip counts, we will not set branch weights
+ // here at all.
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return false;
@@ -869,9 +951,9 @@ bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
unsigned LatchExitWeight = 0;
unsigned BackedgeTakenWeight = 0;
- if (EstimatedTripCount > 0) {
- LatchExitWeight = EstimatedloopInvocationWeight;
- BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+ if (*EstimatedTripCount != 0) {
+ LatchExitWeight = *EstimatedloopInvocationWeight;
+ BackedgeTakenWeight = (*EstimatedTripCount - 1) * LatchExitWeight;
}
// Make a swap if back edge is taken when condition is "false".
@@ -885,7 +967,7 @@ bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
LLVMContext::MD_prof,
MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
- return true;
+ return Updated;
}
bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b9292af..b78c702 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -703,6 +703,7 @@ private:
// Add U as additional user of V.
void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
+ void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI);
void handleCallOverdefined(CallBase &CB);
void handleCallResult(CallBase &CB);
void handleCallArguments(CallBase &CB);
@@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
}
}
+void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf,
+ const PredicateBase *PI) {
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ const std::optional<PredicateConstraint> &Constraint = PI->getConstraint();
+ if (!Constraint) {
+ mergeInValue(ValueState[I], I, CopyOfVal);
+ return;
+ }
+
+ CmpInst::Predicate Pred = Constraint->Predicate;
+ Value *OtherOp = Constraint->OtherOp;
+
+ // Wait until OtherOp is resolved.
+ if (getValueState(OtherOp).isUnknown()) {
+ addAdditionalUser(OtherOp, I);
+ return;
+ }
+
+ ValueLatticeElement CondVal = getValueState(OtherOp);
+ ValueLatticeElement &IV = ValueState[I];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
+ /*UndefAllowed=*/true);
+ // Treat an unresolved input like a full range.
+ if (CopyOfCR.isEmptySet())
+ CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
+ // The new range is based on a branch condition. That guarantees that
+ // neither of the compare operands can be undef in the branch targets,
+ // unless we have conditions that are always true/false (e.g. icmp ule
+ // i32, %a, i32_max). For the latter overdefined/empty range will be
+ // inferred, but the branch will get folded accordingly anyways.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(
+ IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ &&
+ (CondVal.isConstant() || CondVal.isNotConstant())) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants or not-constants.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, CondVal);
+ return;
+ } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
+ // Propagate inequalities.
+ addAdditionalUser(OtherOp, I);
+ mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant()));
+ return;
+ }
+
+ return (void)mergeInValue(IV, I, CopyOfVal);
+}
+
void SCCPInstVisitor::handleCallResult(CallBase &CB) {
Function *F = CB.getCalledFunction();
@@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
return;
Value *CopyOf = CB.getOperand(0);
- ValueLatticeElement CopyOfVal = getValueState(CopyOf);
- const auto *PI = getPredicateInfoFor(&CB);
+ const PredicateBase *PI = getPredicateInfoFor(&CB);
assert(PI && "Missing predicate info for ssa.copy");
-
- const std::optional<PredicateConstraint> &Constraint =
- PI->getConstraint();
- if (!Constraint) {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
- CmpInst::Predicate Pred = Constraint->Predicate;
- Value *OtherOp = Constraint->OtherOp;
-
- // Wait until OtherOp is resolved.
- if (getValueState(OtherOp).isUnknown()) {
- addAdditionalUser(OtherOp, &CB);
- return;
- }
-
- ValueLatticeElement CondVal = getValueState(OtherOp);
- ValueLatticeElement &IV = ValueState[&CB];
- if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
- auto ImposedCR =
- ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
-
- // Get the range imposed by the condition.
- if (CondVal.isConstantRange())
- ImposedCR = ConstantRange::makeAllowedICmpRegion(
- Pred, CondVal.getConstantRange());
-
- // Combine range info for the original value with the new range from the
- // condition.
- auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(),
- /*UndefAllowed=*/true);
- // Treat an unresolved input like a full range.
- if (CopyOfCR.isEmptySet())
- CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth());
- auto NewCR = ImposedCR.intersectWith(CopyOfCR);
- // If the existing information is != x, do not use the information from
- // a chained predicate, as the != x information is more likely to be
- // helpful in practice.
- if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
- NewCR = CopyOfCR;
-
- // The new range is based on a branch condition. That guarantees that
- // neither of the compare operands can be undef in the branch targets,
- // unless we have conditions that are always true/false (e.g. icmp ule
- // i32, %a, i32_max). For the latter overdefined/empty range will be
- // inferred, but the branch will get folded accordingly anyways.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(
- IV, &CB,
- ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false));
- return;
- } else if (Pred == CmpInst::ICMP_EQ &&
- (CondVal.isConstant() || CondVal.isNotConstant())) {
- // For non-integer values or integer constant expressions, only
- // propagate equal constants or not-constants.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB, CondVal);
- return;
- } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) {
- // Propagate inequalities.
- addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB,
- ValueLatticeElement::getNot(CondVal.getConstant()));
- return;
- }
-
- return (void)mergeInValue(IV, &CB, CopyOfVal);
+ handlePredicate(&CB, CopyOf, PI);
+ return;
}
if (II->getIntrinsicID() == Intrinsic::vscale) {
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index ddb062b..1eb8996 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1249,7 +1249,8 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
// offset, if the offset is simpler.
const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
const SCEV *Op = Diff;
- match(Diff, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
+ match(Op, m_scev_Add(m_SCEVConstant(), m_SCEV(Op)));
+ match(Op, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
match(Op, m_scev_PtrToInt(m_SCEV(Op)));
if (!isa<SCEVConstant, SCEVUnknown>(Op))
continue;
@@ -1257,7 +1258,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
assert(Diff->getType()->isIntegerTy() &&
"difference must be of integer type");
Value *DiffV = expand(Diff);
- Value *BaseV = &PN;
+ Value *BaseV = fixupLCSSAFormFor(&PN);
if (PhiTy->isPointerTy()) {
if (STy->isPointerTy())
return Builder.CreatePtrAdd(BaseV, DiffV);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 94b0ab8..674de57 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -198,6 +198,11 @@ static cl::opt<unsigned> MaxSwitchCasesPerResult(
"max-switch-cases-per-result", cl::Hidden, cl::init(16),
cl::desc("Limit cases to analyze when converting a switch to select"));
+static cl::opt<unsigned> MaxJumpThreadingLiveBlocks(
+ "max-jump-threading-live-blocks", cl::Hidden, cl::init(24),
+ cl::desc("Limit number of blocks a define in a threaded block is allowed "
+ "to be live in"));
+
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");
@@ -3390,8 +3395,27 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
return true;
}
+using BlocksSet = SmallPtrSet<BasicBlock *, 8>;
+
+// Return false if number of blocks searched is too much.
+static bool findReaching(BasicBlock *BB, BasicBlock *DefBB,
+ BlocksSet &ReachesNonLocalUses) {
+ if (BB == DefBB)
+ return true;
+ if (!ReachesNonLocalUses.insert(BB).second)
+ return true;
+
+ if (ReachesNonLocalUses.size() > MaxJumpThreadingLiveBlocks)
+ return false;
+ for (BasicBlock *Pred : predecessors(BB))
+ if (!findReaching(Pred, DefBB, ReachesNonLocalUses))
+ return false;
+ return true;
+}
+
/// Return true if we can thread a branch across this block.
-static bool blockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+static bool blockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
+ BlocksSet &NonLocalUseBlocks) {
int Size = 0;
EphemeralValueTracker EphTracker;
@@ -3411,12 +3435,16 @@ static bool blockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
return false; // Don't clone large BB's.
}
- // We can only support instructions that do not define values that are
- // live outside of the current basic block.
+ // Record blocks with non-local uses of values defined in the current basic
+ // block.
for (User *U : I.users()) {
Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI))
- return false;
+ BasicBlock *UsedInBB = UI->getParent();
+ if (UsedInBB == BB) {
+ if (isa<PHINode>(UI))
+ return false;
+ } else
+ NonLocalUseBlocks.insert(UsedInBB);
}
// Looks ok, continue checking.
@@ -3475,18 +3503,37 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
// Now we know that this block has multiple preds and two succs.
- // Check that the block is small enough and values defined in the block are
- // not used outside of it.
- if (!blockIsSimpleEnoughToThreadThrough(BB))
+ // Check that the block is small enough and record which non-local blocks use
+ // values defined in the block.
+
+ BlocksSet NonLocalUseBlocks;
+ BlocksSet ReachesNonLocalUseBlocks;
+ if (!blockIsSimpleEnoughToThreadThrough(BB, NonLocalUseBlocks))
return false;
+ // Jump-threading can only be done to destinations where no values defined
+ // in BB are live.
+
+ // Quickly check if both destinations have uses. If so, jump-threading cannot
+ // be done.
+ if (NonLocalUseBlocks.contains(BI->getSuccessor(0)) &&
+ NonLocalUseBlocks.contains(BI->getSuccessor(1)))
+ return false;
+
+ // Search backward from NonLocalUseBlocks to find which blocks
+ // reach non-local uses.
+ for (BasicBlock *UseBB : NonLocalUseBlocks)
+ // Give up if too many blocks are searched.
+ if (!findReaching(UseBB, BB, ReachesNonLocalUseBlocks))
+ return false;
+
for (const auto &Pair : KnownValues) {
- // Okay, we now know that all edges from PredBB should be revectored to
- // branch to RealDest.
ConstantInt *CB = Pair.first;
ArrayRef<BasicBlock *> PredBBs = Pair.second.getArrayRef();
BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
if (RealDest == BB)
continue; // Skip self loops.
@@ -3496,6 +3543,10 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
}))
continue;
+ // Only revector to RealDest if no values defined in BB are live.
+ if (ReachesNonLocalUseBlocks.contains(RealDest))
+ continue;
+
LLVM_DEBUG({
dbgs() << "Condition " << *Cond << " in " << BB->getName()
<< " has value " << *Pair.first << " in predecessors:\n";
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index f57ce0c..ea0fa06 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -170,8 +170,7 @@ public:
new VPInstruction(Opcode, Operands, Flags, DL, Name));
}
- VPInstruction *createNaryOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
+ VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
Type *ResultTy, const VPIRFlags &Flags = {},
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
@@ -180,7 +179,7 @@ public:
}
VPInstruction *createOverflowingOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
+ ArrayRef<VPValue *> Operands,
VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6616e61f..850c4a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -93,6 +93,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -155,6 +156,7 @@
#include <utility>
using namespace llvm;
+using namespace SCEVPatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
const Loop *L) {
- return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
+ if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
+ return ElementCount::getFixed(ExpectedTC);
+
+ const SCEV *BTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BTC))
+ return ElementCount::getFixed(0);
+
+ const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
+ if (isa<SCEVVScale>(ExitCount))
+ return ElementCount::getScalable(1);
+
+ const APInt *Scale;
+ if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
+ if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
+ if (Scale->getActiveBits() <= 32)
+ return ElementCount::getScalable(Scale->getZExtValue());
+
+ return ElementCount::getFixed(0);
}
/// Returns "best known" trip count, which is either a valid positive trip count
@@ -1363,11 +1382,15 @@ public:
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
if (EVLIsLegal)
return;
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
- ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask};
+ // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
+ // if it's allowed, or DataWithoutLaneMask otherwise.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ else
+ ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask};
+
LLVM_DEBUG(
dbgs() << "LV: Preference for VP intrinsics indicated. Will "
"not try to generate VP Intrinsics "
@@ -2589,12 +2612,12 @@ static void cse(BasicBlock *BB) {
}
}
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// This function attempts to return a value that represents the ElementCount
+/// at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
- std::optional<unsigned> VScale) {
+static unsigned estimateElementCount(ElementCount VF,
+ std::optional<unsigned> VScale) {
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable())
if (VScale)
@@ -2704,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// use the value of vscale used for tuning.
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
unsigned EstimatedVFxUF =
- getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+ estimateElementCount(VF * UF, Cost->getVScaleForTuning());
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}
@@ -2999,7 +3022,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// is correct. The easiest form of the later is to require that all values
// stored are the same.
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
- Legal->isInvariant(cast<StoreInst>(I)->getValueOperand()));
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -3127,7 +3150,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
!isScalarEpilogueAllowed();
bool StoreAccessWithGapsRequiresMasking =
- isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
+ isa<StoreInst>(I) && !Group->isFull();
if (!PredicatedAccessRequiresMasking &&
!LoadAccessWithGapsRequiresEpilogMasking &&
!StoreAccessWithGapsRequiresMasking)
@@ -4333,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
- getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
+ estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
@@ -4441,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
+ return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
MinVFThreshold;
}
@@ -4494,25 +4517,23 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
- getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
+ estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
- if (MainLoopVF.isFixed()) {
- // TODO: extend to support scalable VFs.
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(MainLoopVF).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
-
- // No iterations left to process in the epilogue.
- if (RemainingIterations->isZero())
- return Result;
+ const SCEV *TC =
+ vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+ RemainingIterations =
+ SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+ if (MainLoopVF.isFixed()) {
MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
@@ -4743,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
-
// Try to get the exact trip count, or an estimate based on profiling data or
// ConstantMax from PSE, failing that.
- if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
+ auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
+
+ // For fixed length VFs treat a scalable trip count as unknown.
+ if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
+ // Re-evaluate trip counts and VFs to be in the same numerical space.
+ unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
+ unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
+
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
- unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
- ? BestKnownTC->getFixedValue() - 1
- : BestKnownTC->getFixedValue();
+ if (requiresScalarEpilogue(VF.isVector()))
+ --AvailableTC;
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
@@ -5347,7 +5372,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
(Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
- (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
+ (isa<StoreInst>(I) && !Group->isFull());
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
@@ -6923,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
- unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
+ unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
@@ -7290,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+ // Canonicalize EVL loops after regions are dissolved.
+ VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
@@ -9609,7 +9636,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
- unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
+ unsigned IntVF = estimateElementCount(VF.Width, VScale);
uint64_t RtC = TotalCost.getValue();
uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 225658b..d249a34 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2530,8 +2530,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
else
NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
- PrevInChain);
+ (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
+ PrevInChain, NewRed);
}
State.set(this, NextInChain, /*IsScalar*/ true);
}
@@ -3391,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
- VectorType *InterleaveTy =
- VectorType::get(VecTy->getElementType(),
- VecTy->getElementCount().multiplyCoefficientBy(Factor));
- return Builder.CreateIntrinsic(InterleaveTy,
- getInterleaveIntrinsicID(Factor), Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateVectorInterleave(Vals, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3503,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
NewLoad = State.Builder.CreateIntrinsic(
- getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
- NewLoad,
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 935a4e4..a1d12a3a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -32,11 +32,11 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;
+using namespace VPlanPatternMatch;
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlanPtr &Plan,
@@ -528,13 +528,11 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
- using namespace llvm::PatternMatch;
// Do remove conditional assume instructions as their conditions may be
// flattened.
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
- bool IsConditionalAssume =
- RepR && RepR->isPredicated() &&
- match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+ bool IsConditionalAssume = RepR && RepR->isPredicated() &&
+ match(RepR, m_Intrinsic<Intrinsic::assume>());
if (IsConditionalAssume)
return true;
@@ -625,7 +623,6 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
/// original IV's users. This is an optional optimization to reduce the needs of
/// vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
- using namespace llvm::VPlanPatternMatch;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
@@ -727,7 +724,6 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
return nullptr;
auto IsWideIVInc = [&]() {
- using namespace VPlanPatternMatch;
auto &ID = WideIV->getInductionDescriptor();
// Check if VPV increments the induction by the induction step.
@@ -771,8 +767,6 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
VPTypeAnalysis &TypeInfo,
VPBlockBase *PredVPBB,
VPValue *Op) {
- using namespace VPlanPatternMatch;
-
VPValue *Incoming, *Mask;
if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
m_VPInstruction<VPInstruction::FirstActiveLane>(
@@ -827,8 +821,6 @@ static VPValue *
optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
VPBlockBase *PredVPBB, VPValue *Op,
DenseMap<VPValue *, VPValue *> &EndValues) {
- using namespace VPlanPatternMatch;
-
VPValue *Incoming;
if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>(
m_VPValue(Incoming))))
@@ -986,7 +978,6 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
- using namespace llvm::VPlanPatternMatch;
VPlan *Plan = R.getParent()->getPlan();
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
@@ -1094,6 +1085,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
+ if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
+ return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
+ : R.getOperand(0));
+
if (match(Def, m_Not(m_VPValue(A)))) {
if (match(A, m_Not(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
@@ -1265,7 +1260,6 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
- using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
@@ -1389,7 +1383,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
// Currently only handle cases where the single user is a header-mask
// comparison with the backedge-taken-count.
- using namespace VPlanPatternMatch;
if (!match(
*WideIV->user_begin(),
m_Binary<Instruction::ICmp>(
@@ -1420,7 +1413,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
ElementCount BestVF, unsigned BestUF,
ScalarEvolution &SE) {
- using namespace llvm::VPlanPatternMatch;
if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
&SE](VPValue *C) {
@@ -1460,7 +1452,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *Term = &ExitingVPBB->back();
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
- using namespace llvm::VPlanPatternMatch;
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
match(Term, m_BranchOnCond(
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
@@ -1843,7 +1834,6 @@ void VPlanTransforms::truncateToMinimalBitwidths(
if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
VPW->dropPoisonGeneratingFlags();
- using namespace llvm::VPlanPatternMatch;
if (OldResSizeInBits != NewResSizeInBits &&
!match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
// Extend result to original width.
@@ -1893,7 +1883,6 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
- using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
VPValue *Cond;
@@ -2139,7 +2128,6 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
VPValue &AllOneMask, VPValue &EVL) {
- using namespace llvm::VPlanPatternMatch;
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
// HeaderMask will be handled using EVL.
@@ -2219,7 +2207,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
- using namespace VPlanPatternMatch;
VPValue *V1, *V2;
if (!match(&R,
m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
@@ -2240,6 +2227,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// Try to optimize header mask recipes away to their EVL variants.
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+ // TODO: Split optimizeMaskToEVL out and move into
+ // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
+ // tryToBuildVPlanWithVPRecipes beforehand.
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
VPRecipeBase *EVLRecipe =
@@ -2261,6 +2251,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
}
+
+ // Replace header masks with a mask equivalent to predicating by EVL:
+ //
+ // icmp ule widen-canonical-iv backedge-taken-count
+ // ->
+ // icmp ult step-vector, EVL
+ VPRecipeBase *EVLR = EVL.getDefiningRecipe();
+ VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
+ Type *EVLType = TypeInfo.inferScalarType(&EVL);
+ VPValue *EVLMask = Builder.createICmp(
+ CmpInst::ICMP_ULT,
+ Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
+ HeaderMask->replaceAllUsesWith(EVLMask);
+ ToErase.push_back(HeaderMask->getDefiningRecipe());
}
for (VPRecipeBase *R : reverse(ToErase)) {
@@ -2369,6 +2373,65 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
return true;
}
+void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
+ // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
+ // There should be only one EVL PHI in the entire plan.
+ VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : VPBB->phis())
+ if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
+ assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
+ EVLPhi = PhiR;
+ }
+
+ // Early return if no EVL PHI is found.
+ if (!EVLPhi)
+ return;
+
+ VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
+ VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+
+ // Convert EVLPhi to concrete recipe.
+ auto *ScalarR =
+ VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
+ EVLPhi->getDebugLoc(), "evl.based.iv");
+ EVLPhi->replaceAllUsesWith(ScalarR);
+ EVLPhi->eraseFromParent();
+
+ // Replace CanonicalIVInc with EVL-PHI increment.
+ auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+ VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+ assert(match(Backedge,
+ m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV),
+ m_Specific(&Plan.getVFxUF()))) &&
+ "Unexpected canonical iv");
+ Backedge->replaceAllUsesWith(EVLIncrement);
+
+ // Remove unused phi and increment.
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ CanonicalIVIncrement->eraseFromParent();
+ CanonicalIV->eraseFromParent();
+
+ // Replace the use of VectorTripCount in the latch-exiting block.
+ // Before: (branch-on-count EVLIVInc, VectorTripCount)
+ // After: (branch-on-count EVLIVInc, TripCount)
+
+ VPBasicBlock *LatchExiting =
+ HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ // Skip single-iteration loop region
+ if (match(LatchExitingBr, m_BranchOnCond(m_True())))
+ return;
+ assert(LatchExitingBr &&
+ match(LatchExitingBr,
+ m_BranchOnCount(m_VPValue(EVLIncrement),
+ m_Specific(&Plan.getVectorTripCount()))) &&
+ "Unexpected terminator in EVL loop");
+ LatchExitingBr->setOperand(1, Plan.getTripCount());
+}
+
void VPlanTransforms::dropPoisonGeneratingRecipes(
VPlan &Plan,
const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2399,7 +2462,6 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// drop them directly.
if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
VPValue *A, *B;
- using namespace llvm::VPlanPatternMatch;
// Dropping disjoint from an OR may yield incorrect results, as some
// analysis may have converted it to an Add implicitly (e.g. SCEV used
// for dependence analysis). Instead, replace it with an equivalent Add.
@@ -2693,22 +2755,11 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
- using namespace llvm::VPlanPatternMatch;
-
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
- auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
- {PhiR->getStartValue(), PhiR->getBackedgeValue()},
- PhiR->getDebugLoc(), "evl.based.iv");
- PhiR->replaceAllUsesWith(ScalarR);
- ToRemove.push_back(PhiR);
- continue;
- }
-
if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
ToRemove.push_back(WidenIVR);
@@ -2780,8 +2831,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
void VPlanTransforms::handleUncountableEarlyExit(
VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
- using namespace llvm::VPlanPatternMatch;
-
VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
if (!EarlyExitVPBB->getSinglePredecessor() &&
EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -2875,8 +2924,6 @@ void VPlanTransforms::handleUncountableEarlyExit(
static VPExpressionRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VFRange &Range) {
- using namespace VPlanPatternMatch;
-
Type *RedTy = Ctx.Types.inferScalarType(Red);
VPValue *VecOp = Red->getVecOp();
@@ -2922,8 +2969,6 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
- using namespace VPlanPatternMatch;
-
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
if (Opcode != Instruction::Add)
return nullptr;
@@ -3137,9 +3182,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
- return IR->getInterleaveGroup()->getFactor() ==
- IR->getInterleaveGroup()->getNumMembers() &&
- IR->getVPValue(Idx) == OpV;
+ return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
return false;
}
@@ -3186,7 +3229,6 @@ static bool isAlreadyNarrow(VPValue *VPV) {
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
- using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
if (VF.isScalable() || !VectorLoop)
return;
@@ -3256,9 +3298,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (!DefR)
return false;
auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
- return IR &&
- IR->getInterleaveGroup()->getFactor() ==
- IR->getInterleaveGroup()->getNumMembers() &&
+ return IR && IR->getInterleaveGroup()->isFull() &&
IR->getVPValue(Op.index()) == Op.value();
})) {
StoreGroups.push_back(InterleaveR);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index d5af6cd..880159f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -209,6 +209,18 @@ struct VPlanTransforms {
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
+ /// Transform EVL loops to use variable-length stepping after region
+ /// dissolution.
+ ///
+ /// Once loop regions are replaced with explicit CFG, EVL loops can step with
+ /// variable vector lengths instead of fixed lengths. This transformation:
+ /// * Makes EVL-Phi concrete.
+ // * Removes CanonicalIV and increment.
+ /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
+ /// VectorTripCount) with variable-length stepping (branch-on-cond
+ /// EVLIVInc, TripCount).
+ static void canonicalizeEVLLoops(VPlan &Plan);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 38ada33..5fbff69 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -17,6 +17,7 @@
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
+#include "VPlanPatternMatch.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/TypeSwitch.h"
@@ -78,9 +79,8 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
NumActiveLaneMaskPhiRecipes++;
- if (IsHeaderVPBB && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(*RecipeI) &&
- !isa<VPInstruction>(*RecipeI) &&
- cast<VPInstruction>(RecipeI)->getOpcode() == Instruction::PHI) {
+ if (IsHeaderVPBB &&
+ !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPPhi>(*RecipeI)) {
errs() << "Found non-header PHI recipe in header VPBB";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
errs() << ": ";
@@ -171,7 +171,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
.Case<VPInstruction>([&](const VPInstruction *I) {
- if (I->getOpcode() == Instruction::PHI)
+ if (I->getOpcode() == Instruction::PHI ||
+ I->getOpcode() == Instruction::ICmp)
return VerifyEVLUse(*I, 1);
switch (I->getOpcode()) {
case Instruction::Add:
@@ -192,7 +193,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
errs() << "EVL used by unexpected VPInstruction\n";
return false;
}
- if (I->getNumUsers() != 1) {
+ // EVLIVIncrement is only used by EVLIV & BranchOnCount.
+ // Having more than two users is unexpected.
+ if ((I->getNumUsers() != 1) &&
+ (I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) {
+ using namespace llvm::VPlanPatternMatch;
+ return match(U, m_BranchOnCount(m_Specific(I), m_VPValue()));
+ }))) {
errs() << "EVL is used in VPInstruction with multiple users\n";
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6252f4f..6345b18 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1664,6 +1664,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
// store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
+ if (!TTI.allowVectorElementIndexingUsingGEP())
+ return false;
auto *SI = cast<StoreInst>(&I);
if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
return false;
@@ -1719,6 +1721,9 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
+ if (!TTI.allowVectorElementIndexingUsingGEP())
+ return false;
+
Value *Ptr;
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
@@ -1827,6 +1832,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
}
bool VectorCombine::scalarizeExtExtract(Instruction &I) {
+ if (!TTI.allowVectorElementIndexingUsingGEP())
+ return false;
auto *Ext = dyn_cast<ZExtInst>(&I);
if (!Ext)
return false;