//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNNSAReassign.h"
#include "GCNPreRALongBranchReg.h"
#include "GCNPreRAOptimizations.h"
#include "GCNRewritePartialRegUses.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeExecMaskingPreRA.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPostRABundler.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "SIWholeQuadMode.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                const Register Reg) {
  const SIMachineFunctionInfo *MFI =
      MRI.getMF().getInfo<SIMachineFunctionInfo>();
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
         MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static WWMRegisterRegAlloc
    basicRegAllocWWMReg("basic", "basic register allocator",
                        createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc
    fastRegAllocWWMReg("fast", "fast register allocator",
                       createFastWWMRegisterAllocator);
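// Illustrative usage (assumed invocation, not exercised in this file): the
// three allocators registered above can be chosen independently from the llc
// command line, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx900 -sgpr-regalloc=greedy \
//       -wwm-regalloc=basic -vgpr-regalloc=fast input.ll
// The "default" entry keeps the -O-level based choice made by
// GCNPassConfig::create{SGPR,VGPR,WWMReg}AllocPass further down in this file.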
static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true), cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true), cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false), cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all", cl::desc("Inline all functions early"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool>
    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                              cl::desc("Enable AMDGPU Alias Analysis"),
                              cl::init(true));

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register", cl::desc("Enable mode register pass"),
    cl::init(true), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);
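// Illustrative usage (assumed invocation, not exercised here): most of the
// toggles above are plain boolean cl::opts and the atomic-optimizer strategy
// is an enum, so they can be flipped on the llc command line for experiments
// or lit tests, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 \
//       -amdgpu-sdwa-peephole=0 -amdgpu-atomic-optimizer-strategy=DPP input.ll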
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"),
    cl::init(true), cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable lowering of lds to global memory pass "
                              "and asan instrument resulting IR."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
    "amdgpu-link-time-closed-world",
    cl::desc("Whether to assume a closed world at link time"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeR600EmitClauseMarkersPass(*PR);
  initializeR600MachineCFGStructurizerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUAsmPrinterPass(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR); initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); initializeSIOptimizeExecMaskingPreRALegacyPass(*PR); initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR); initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPrepareLegacyPass(*PR); initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPUReserveWWMRegsLegacyPass(*PR); initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); initializeSIInsertHardClausesLegacyPass(*PR); initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); initializeSIPreEmitPeepholeLegacyPass(*PR); initializeSILateBranchLoweringLegacyPass(*PR); initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); initializeSIPreAllocateWWMRegsLegacyPass(*PR); initializeSIFormMemoryClausesLegacyPass(*PR); initializeSIPostRABundlerLegacyPass(*PR); initializeGCNCreateVOPDLegacyPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); initializeAMDGPUImageIntrinsicOptimizerPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR); initializeGCNNSAReassignLegacyPass(*PR); initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); initializeGCNRewritePartialRegUsesLegacyPass(*PR); initializeGCNRegPressurePrinterPass(*PR); initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { return new SIScheduleDAGMI(C); } static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; } static ScheduleDAGInstrs * 
createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs * createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive( C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; } static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); auto *DAG = new GCNIterativeScheduler( C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { auto *DAG = new GCNIterativeScheduler( C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler); static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler); static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler); static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry( "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause", createGCNMaxMemoryClauseMachineScheduler); static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry( "gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler); static MachineSchedRegistry GCNMinRegSchedRegistry( "gcn-iterative-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler); static MachineSchedRegistry GCNILPSchedRegistry( "gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler); static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; } // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. 
  // 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
         "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
         "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.isAMDGCN())
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(RM),
          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.isAMDGCN()) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}
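// Illustrative IR (assumed example, not taken from a test): getGPUName() and
// getFeatureString() above key the subtarget off per-function attributes, so
// a function such as
//   define amdgpu_kernel void @k() #0 { ret void }
//   attributes #0 = { "target-cpu"="gfx90a" "target-features"="+xnack" }
// is compiled for that CPU/feature set instead of the module-level
// -mcpu/-mattr defaults (see GCNTargetMachine::getSubtargetImpl below).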
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerScalarOptimizerLateEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerVectorizerEndEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level,
         ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase)) {
          // When we are not using -fgpu-rdc, we can run accelerator code
          // selection relatively early, but still after linking to prevent
          // eager removal of potentially reachable symbols.
          if (EnableHipStdPar)
            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
        }

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        // We don't want to run internalization at per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase)) {
        AMDGPUAttributorOptions Opts;
        MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
      }
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking; otherwise we end up removing potentially
        // reachable symbols that were exported as external in other modules.
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // We only want to run this with O2 or higher since inliner and SROA
          // don't run in O1.
          if (Level != OptimizationLevel::O1) {
            PM.addPass(
                createModuleToFunctionPassAdaptor(InferAddressSpacesPass()));
          }
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(
                *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  if (auto *Arg = dyn_cast<Argument>(V);
      Arg &&
      AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
      !Arg->hasByRefAttr())
    return AMDGPUAS::GLOBAL_ADDRESS;

  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
resetTargetOptions(F); I = std::make_unique(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); return I.get(); } TargetTransformInfo GCNTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(std::make_unique(this, F)); } Error GCNTargetMachine::buildCodeGenPipeline( ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) { AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC); return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } ScheduleDAGInstrs * GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const { const GCNSubtarget &ST = C->MF->getSubtarget(); if (ST.enableSIScheduler()) return createSIMachineScheduler(C); Attribute SchedStrategyAttr = C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy"); StringRef SchedStrategy = SchedStrategyAttr.isValid() ? SchedStrategyAttr.getValueAsString() : AMDGPUSchedStrategy; if (SchedStrategy == "max-ilp") return createGCNMaxILPMachineScheduler(C); if (SchedStrategy == "max-memory-clause") return createGCNMaxMemoryClauseMachineScheduler(C); if (SchedStrategy == "iterative-ilp") return createIterativeILPMachineScheduler(C); if (SchedStrategy == "iterative-minreg") return createMinRegScheduler(C); if (SchedStrategy == "iterative-maxocc") return createIterativeGCNMaxOccupancyMachineScheduler(C); return createGCNMaxOccupancyMachineScheduler(C); } ScheduleDAGInstrs * GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(C, std::make_unique(C), /*RemoveKillFlags=*/true); const GCNSubtarget &ST = C->MF->getSubtarget(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); if ((EnableVOPD.getNumOccurrences() || getOptLevel() >= CodeGenOptLevel::Less) && EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; } //===----------------------------------------------------------------------===// // AMDGPU Legacy Pass Setup //===----------------------------------------------------------------------===// std::unique_ptr llvm::AMDGPUPassConfig::getCSEConfig() const { return getStandardCSEConfigForOpt(TM->getOptLevel()); } namespace { class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { // It is necessary to know the register usage of the entire call graph. We // allow calls without EnableAMDGPUFunctionCalls if they are marked // noinline, so this is always required. 
setRequiresCodeGenSCCOrder(true); substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } GCNTargetMachine &getGCNTargetMachine() const { return getTM(); } bool addPreISel() override; void addMachineSSAOptimization() override; bool addILPOpts() override; bool addInstSelector() override; bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; void addPreRegBankSelect() override; bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; FunctionPass *createSGPRAllocPass(bool Optimized); FunctionPass *createVGPRAllocPass(bool Optimized); FunctionPass *createWWMRegAllocPass(bool Optimized); FunctionPass *createRegAllocPass(bool Optimized) override; bool addRegAssignAndRewriteFast() override; bool addRegAssignAndRewriteOptimized() override; bool addPreRewrite() override; void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; void addPostBBSections() override; }; } // end anonymous namespace AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // Exceptions and StackMaps are not supported, so these passes will never do // anything. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); // Garbage collection is not supported. disablePass(&GCLoweringID); disablePass(&ShadowStackGCLoweringID); } void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { if (getOptLevel() == CodeGenOptLevel::Aggressive) addPass(createGVNPass()); else addPass(createEarlyCSEPass()); } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) addPass(createLoopDataPrefetchPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. addPass(createStraightLineStrengthReducePass()); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. addEarlyCSEOrGVNPass(); // Run NaryReassociate after EarlyCSE/GVN to be more effective. addPass(createNaryReassociatePass()); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. addPass(createEarlyCSEPass()); } void AMDGPUPassConfig::addIRPasses() { const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM)); // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); addPass(createAMDGPUPrintfRuntimeBinding()); if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 
if (TM.getTargetTriple().getArch() == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) { addPass(createAMDGPULowerModuleLDSLegacyPass(&TM)); } // Run atomic optimizer before Atomic Expand if ((TM.getTargetTriple().isAMDGCN()) && (TM.getOptLevel() >= CodeGenOptLevel::Less) && (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); } addPass(createAtomicExpandLegacyPass()); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(createAMDGPUPromoteAlloca()); if (isPassEnabled(EnableScalarIRPasses)) addStraightLineScalarOptimizationPasses(); if (EnableAMDGPUAliasAnalysis) { addPass(createAMDGPUAAWrapperPass()); addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { if (auto *WrapperPass = P.getAnalysisIfAvailable()) AAR.addAAResult(WrapperPass->getResult()); })); } if (TM.getTargetTriple().isAMDGCN()) { // TODO: May want to move later or split into an early and late one. addPass(createAMDGPUCodeGenPreparePass()); } // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may // have expanded. if (TM.getOptLevel() > CodeGenOptLevel::Less) addPass(createLICMPass()); } TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces. For // example, GVN can combine // // %0 = add %a, %b // %1 = add %b, %a // // and // // %0 = shl nsw %a, 2 // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. if (isPassEnabled(EnableScalarIRPasses)) addEarlyCSEOrGVNPass(); } void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && TM->getOptLevel() > CodeGenOptLevel::None) addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM)); if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis // that it changes by splitting up fat pointers into their components) // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. // // FIXME: This should ideally be put after the LoadStoreVectorizer. // However, due to some annoying facts about ResourceUsageAnalysis, // (especially as exercised in the resource-usage-dead-function test), // we need all the function passes codegenprepare all the way through // said resource usage analysis to run on the call graph produced // before codegenprepare runs (because codegenprepare will knock some // nodes out of the graph, which leads to function-level passes not // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); // In accordance with the above FIXME, manually force all the // function-level passes into a CGSCCPassManager. 
addPass(new DummyCGSCCPass()); } TargetPassConfig::addCodeGenPrepare(); if (isPassEnabled(EnableLoadStoreVectorizer)) addPass(createLoadStoreVectorizerPass()); // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by // UnreachableBlockElim inserted next in the pass flow. addPass(createLowerSwitchPass()); } bool AMDGPUPassConfig::addPreISel() { if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createFlattenCFGPass()); return false; } bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel())); return false; } bool AMDGPUPassConfig::addGCPasses() { // Do nothing. GC is not supported. return false; } //===----------------------------------------------------------------------===// // GCN Legacy Pass Setup //===----------------------------------------------------------------------===// bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createSinkingPass()); if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createAMDGPULateCodeGenPrepareLegacyPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); addPass(createFixIrreduciblePass()); addPass(createUnifyLoopExitsPass()); addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions addPass(createAMDGPUAnnotateUniformValuesLegacy()); addPass(createSIAnnotateControlFlowLegacyPass()); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. This depends on stopping SIAnnotateControlFlow from making // control flow modifications. addPass(createAMDGPURewriteUndefForPHILegacyPass()); // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel // with -new-reg-bank-select and without any of the fallback options. if (!getCGPassBuilderOption().EnableGlobalISelOption || !isGlobalISelAbortEnabled() || !NewRegBankSelect) addPass(createLCSSAPass()); if (TM->getOptLevel() > CodeGenOptLevel::Less) addPass(&AMDGPUPerfHintAnalysisLegacyID); return false; } void GCNPassConfig::addMachineSSAOptimization() { TargetPassConfig::addMachineSSAOptimization(); // We want to fold operands after PeepholeOptimizer has run (or as part of // it), because it will eliminate extra copies making it easier to fold the // real source operand. We want to eliminate dead instructions after, so that // we see fewer uses of the copies. We then need to clean up the dead // instructions leftover after the operands are folded as well. // // XXX - Can we get away without running DeadMachineInstructionElim again? 
addPass(&SIFoldOperandsLegacyID); if (EnableDPPCombine) addPass(&GCNDPPCombineLegacyID); addPass(&SILoadStoreOptimizerLegacyID); if (isPassEnabled(EnableSDWAPeephole)) { addPass(&SIPeepholeSDWALegacyID); addPass(&EarlyMachineLICMID); addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); } addPass(&DeadMachineInstructionElimID); addPass(createSIShrinkInstructionsLegacyPass()); } bool GCNPassConfig::addILPOpts() { if (EnableEarlyIfConversion) addPass(&EarlyIfConverterLegacyID); TargetPassConfig::addILPOpts(); return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(&SIFixSGPRCopiesLegacyID); addPass(createSILowerI1CopiesLegacyPass()); return false; } bool GCNPassConfig::addIRTranslator() { addPass(new IRTranslator(getOptLevel())); return false; } void GCNPassConfig::addPreLegalizeMachineIR() { bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); addPass(new Localizer()); } bool GCNPassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } void GCNPassConfig::addPreRegBankSelect() { bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); addPass(createAMDGPUGlobalISelDivergenceLoweringPass()); } bool GCNPassConfig::addRegBankSelect() { if (NewRegBankSelect) { addPass(createAMDGPURegBankSelectPass()); addPass(createAMDGPURegBankLegalizePass()); } else { addPass(new RegBankSelect()); } return false; } void GCNPassConfig::addPreGlobalInstructionSelect() { bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPURegBankCombiner(IsOptNone)); } bool GCNPassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect(getOptLevel())); return false; } void GCNPassConfig::addFastRegAlloc() { // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID); insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); TargetPassConfig::addFastRegAlloc(); } void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); // FIXME: when an instruction has a Killed operand, and the instruction is // inside a bundle, seems only the BUNDLE instruction appears as the Kills of // the register in LiveVariables, this would trigger a failure in verifier, // we should fix it and enable the verifier. if (OptVGPRLiveRange) insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID); if (EnableRewritePartialRegUses) insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID); if (isPassEnabled(EnablePreRAOptimizations)) insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID); // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation // instructions that cause scheduling barriers. 
insertPass(&MachineSchedulerID, &SIWholeQuadModeID); if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); // This is not an essential optimization and it has a noticeable impact on // compilation time, so we only enable it from O2. if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); TargetPassConfig::addOptimizedRegAlloc(); } bool GCNPassConfig::addPreRewrite() { if (EnableRegReassign) addPass(&GCNNSAReassignID); addPass(&AMDGPURewriteAGPRCopyMFMALegacyID); return true; } FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { // Initialize the global default. llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, initializeDefaultSGPRRegisterAllocatorOnce); RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); if (Ctor != useDefaultRegisterAllocator) return Ctor(); if (Optimized) return createGreedyRegisterAllocator(onlyAllocateSGPRs); return createFastRegisterAllocator(onlyAllocateSGPRs, false); } FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { // Initialize the global default. llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, initializeDefaultVGPRRegisterAllocatorOnce); RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); if (Ctor != useDefaultRegisterAllocator) return Ctor(); if (Optimized) return createGreedyVGPRRegisterAllocator(); return createFastVGPRRegisterAllocator(); } FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) { // Initialize the global default. llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag, initializeDefaultWWMRegisterAllocatorOnce); RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault(); if (Ctor != useDefaultRegisterAllocator) return Ctor(); if (Optimized) return createGreedyWWMRegisterAllocator(); return createFastWWMRegisterAllocator(); } FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { llvm_unreachable("should not be used"); } static const char RegAllocOptNotSupportedMessage[] = "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, " "and -vgpr-regalloc"; bool GCNPassConfig::addRegAssignAndRewriteFast() { if (!usingDefaultRegAlloc()) reportFatalUsageError(RegAllocOptNotSupportedMessage); addPass(&GCNPreRALongBranchRegID); addPass(createSGPRAllocPass(false)); // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsLegacyID); // To Allocate wwm registers used in whole quad mode operations (for shaders). addPass(&SIPreAllocateWWMRegsLegacyID); // For allocating other wwm register operands. addPass(createWWMRegAllocPass(false)); addPass(&SILowerWWMCopiesLegacyID); addPass(&AMDGPUReserveWWMRegsLegacyID); // For allocating per-thread VGPRs. addPass(createVGPRAllocPass(false)); return true; } bool GCNPassConfig::addRegAssignAndRewriteOptimized() { if (!usingDefaultRegAlloc()) reportFatalUsageError(RegAllocOptNotSupportedMessage); addPass(&GCNPreRALongBranchRegID); addPass(createSGPRAllocPass(true)); // Commit allocated register changes. This is mostly necessary because too // many things rely on the use lists of the physical registers, such as the // verifier. This is only necessary with allocators which use LiveIntervals, // since FastRegAlloc does the replacements itself. addPass(createVirtRegRewriter(false)); // At this point, the sgpr-regalloc has been done and it is good to have the // stack slot coloring to try to optimize the SGPR spill stack indices before // attempting the custom SGPR spill lowering. 
addPass(&StackSlotColoringID); // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsLegacyID); // To Allocate wwm registers used in whole quad mode operations (for shaders). addPass(&SIPreAllocateWWMRegsLegacyID); // For allocating other whole wave mode registers. addPass(createWWMRegAllocPass(true)); addPass(&SILowerWWMCopiesLegacyID); addPass(createVirtRegRewriter(false)); addPass(&AMDGPUReserveWWMRegsLegacyID); // For allocating per-thread VGPRs. addPass(createVGPRAllocPass(true)); addPreRewrite(); addPass(&VirtRegRewriterID); addPass(&AMDGPUMarkLastScratchLoadID); return true; } void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIOptimizeExecMaskingLegacyID); TargetPassConfig::addPostRegAlloc(); } void GCNPassConfig::addPreSched2() { if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createSIShrinkInstructionsLegacyPass()); addPass(&SIPostRABundlerLegacyID); } void GCNPassConfig::addPreEmitPass() { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); addPass(createSIModeRegisterPass()); if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIInsertHardClausesID); addPass(&SILateBranchLoweringPassID); if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) addPass(createAMDGPUSetWavePriorityPass()); if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled // bottom up, so when we begin to schedule a region we don't know what // instructions were emitted directly before it. // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. addPass(&PostRAHazardRecognizerID); addPass(&AMDGPUWaitSGPRHazardsLegacyID); if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); addPass(&BranchRelaxationPassID); } void GCNPassConfig::addPostBBSections() { // We run this later to avoid passes like livedebugvalues and BBSections // having to deal with the apparent multi-entry functions we may generate. 
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName,
                           Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);

    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.DispatchPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.DispatchID, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupIDX, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupIDY, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupIDZ, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;

  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
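// The hooks below are the new-pass-manager counterparts of the legacy
// GCNPassConfig overrides defined earlier in this file; the pass sequences
// largely mirror each other.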
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass,
              ShadowStackGCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));

  addPass(AMDGPUPrintfRuntimeBindingPass());
  if (LowerCtorDtor)
    addPass(AMDGPUCtorDtorLoweringPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  addPass(AMDGPUAlwaysInlinePass());
  addPass(AlwaysInlinerPass());

  addPass(AMDGPUExportKernelRuntimeHandlesPass());

  if (EnableSwLowerLDS)
    addPass(AMDGPUSwLowerLDSPass(TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addPass(AMDGPULowerModuleLDSPass(TM));

  // Run the atomic optimizer before AtomicExpand.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));

  addPass(AtomicExpandPass(&TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(AMDGPUPromoteAllocaPass(TM));
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(addPass);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addPass(AMDGPUCodeGenPreparePass(TM));

    // TODO: LICM
  }

  Base::addIRPasses(addPass);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(addPass);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(AMDGPUPreloadKernelArgumentsPass(TM));

  if (EnableLowerKernelArguments)
    addPass(AMDGPULowerKernelArgumentsPass(TM));

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components) but
  // has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  //
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis
  // (especially as exercised in the resource-usage-dead-function test), we
  // need all the function passes from codegenprepare all the way through said
  // resource usage analysis to run on the call graph produced before
  // codegenprepare runs (because codegenprepare will knock some nodes out of
  // the graph, which leads to function-level passes not being run on them,
  // which causes crashes in the resource usage analysis).
  addPass(AMDGPULowerBufferFatPointersPass(TM));
  addPass.requireCGSCCOrder();

  Base::addCodeGenPrepare(addPass);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(LoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here lets the
  // UnreachableBlockElim pass, inserted next in the pass flow, clean those
  // blocks up.
  addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(FlattenCFGPass());
    addPass(SinkingPass());
    addPass(AMDGPULateCodeGenPreparePass(TM));
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(AMDGPUUnifyDivergentExitNodesPass());

  addPass(FixIrreduciblePass());
  addPass(UnifyLoopExitsPass());
  addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));

  addPass(AMDGPUAnnotateUniformValuesPass());

  addPass(SIAnnotateControlFlowPass(TM));

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(AMDGPURewriteUndefForPHIPass());

  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(LCSSAPass());

  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    addPass(AMDGPUPerfHintAnalysisPass(TM));

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(),
          /*Force=*/true);
}

void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
  if (EnableEarlyIfConversion)
    addPass(EarlyIfConverterPass());

  Base::addILPOpts(addPass);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                             CreateMCStreamer) const {
  // TODO: Add AsmPrinter.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
  addPass(AMDGPUISelDAGToDAGPass(TM));
  addPass(SIFixSGPRCopiesPass());
  addPass(SILowerI1CopiesPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
  if (EnableRegReassign) {
    addPass(GCNNSAReassignPass());
  }
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    AddMachinePass &addPass) const {
  Base::addMachineSSAOptimization(addPass);

  addPass(SIFoldOperandsPass());
  if (EnableDPPCombine) {
    addPass(GCNDPPCombinePass());
  }
  addPass(SILoadStoreOptimizerPass());
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(SIPeepholeSDWAPass());
    addPass(EarlyMachineLICMPass());
    addPass(MachineCSEPass());
    addPass(SIFoldOperandsPass());
  }
  addPass(DeadMachineInstructionElimPass());
  addPass(SIShrinkInstructionsPass());
}

void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
    AddMachinePass &addPass) const {
  if (EnableDCEInRA)
    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
  // of the register in LiveVariables; this triggers a verifier failure. We
  // should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
        SIOptimizeVGPRLiveRangePass());

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  if (EnableRewritePartialRegUses)
    insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());

  if (OptExecMaskPreRA)
    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());

  Base::addOptimizedRegAlloc(addPass);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    AddMachinePass &addPass) const {
  // TODO: Check --regalloc-npm option

  addPass(GCNPreRALongBranchRegPass());

  addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(VirtRegRewriterPass(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(StackSlotColoringPass());

  // Equivalent of PEI for SGPRs.
  addPass(SILowerSGPRSpillsPass());

  // To allocate wwm registers used in whole quad mode operations (for
  // shaders).
  addPass(SIPreAllocateWWMRegsPass());

  // For allocating other wwm register operands.
  addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
  addPass(SILowerWWMCopiesPass());
  addPass(VirtRegRewriterPass(false));
  addPass(AMDGPUReserveWWMRegsPass());

  // For allocating per-thread VGPRs.
  addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));

  addPreRewrite(addPass);
  addPass(VirtRegRewriterPass(true));

  addPass(AMDGPUMarkLastScratchLoadPass());

  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
  addPass(SIFixVGPRCopiesPass());
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIOptimizeExecMaskingPass());
  Base::addPostRegAlloc(addPass);
}

void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addPass(GCNCreateVOPDPass());
  }

  addPass(SIMemoryLegalizerPass());
  addPass(SIInsertWaitcntsPass());

  // TODO: addPass(SIModeRegisterPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    // TODO: addPass(SIInsertHardClausesPass());
  }

  addPass(SILateBranchLoweringPass());

  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(AMDGPUSetWavePriorityPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIPreEmitPeepholePass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee being able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(PostRAHazardRecognizerPass());
  addPass(AMDGPUWaitSGPRHazardsPass());

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addPass(AMDGPUInsertDelayAluPass());
  }

  addPass(BranchRelaxationPass());
}

bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(GVNPass());
  else
    addPass(EarlyCSEPass());
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    AddIRPass &addPass) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(LoopDataPrefetchPass());

  addPass(SeparateConstOffsetFromGEPPass());

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(StraightLineStrengthReducePass());

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(addPass);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(NaryReassociatePass());

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(EarlyCSEPass());
}
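// For reference, the straight-line scalar pipeline above can be approximated
// with a standalone 'opt' invocation (a sketch only; it ignores the
// EnableLoopPrefetch gating and the GVN-vs-EarlyCSE choice made by
// addEarlyCSEOrGVNPass, and assumes the upstream new-pass-manager pass names):
//
//   opt -passes='separate-const-offset-from-gep,slsr,early-cse,nary-reassociate,early-cse' in.ll -S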