//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file declares the NVPTX specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H #include "NVPTX.h" #include "NVPTXFrameLowering.h" #include "NVPTXISelLowering.h" #include "NVPTXInstrInfo.h" #include "NVPTXRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/NVPTXAddrSpace.h" #include #define GET_SUBTARGETINFO_HEADER #include "NVPTXGenSubtargetInfo.inc" namespace llvm { class NVPTXSubtarget : public NVPTXGenSubtargetInfo { virtual void anchor(); std::string TargetName; // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned PTXVersion; // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310 // sm_90a == 901 unsigned int FullSmVersion; // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from // FullSmVersion. unsigned int SmVersion; NVPTXInstrInfo InstrInfo; NVPTXTargetLowering TLInfo; std::unique_ptr TSInfo; // NVPTX does not have any call stack frame, but need a NVPTX specific // FrameLowering class because TargetFrameLowering is abstract. NVPTXFrameLowering FrameLowering; public: /// This constructor initializes the data members to match that /// of the specified module. /// NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM); ~NVPTXSubtarget() override; const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } const NVPTXRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } const NVPTXTargetLowering *getTargetLowering() const override { return &TLInfo; } const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; bool has256BitVectorLoadStore(unsigned AS) const { return SmVersion >= 100 && PTXVersion >= 88 && AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; } bool hasAtomAddF64() const { return SmVersion >= 60; } bool hasAtomScope() const { return SmVersion >= 60; } bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; } bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } bool hasLDG() const { return SmVersion >= 32; } bool hasHWROT32() const { return SmVersion >= 32; } bool hasFP16Math() const { return SmVersion >= 53; } bool hasBF16Math() const { return SmVersion >= 80; } bool allowFP16Math() const; bool hasMaskOperator() const { return PTXVersion >= 71; } bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, // release, acq_rel, sc) ? bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } // Does SM & PTX support .acquire and .release qualifiers for fence? bool hasSplitAcquireAndReleaseFences() const { return SmVersion >= 90 && PTXVersion >= 86; } // Does SM & PTX support atomic relaxed MMIO operations ? bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } // Tcgen05 instructions in Blackwell family bool hasTcgen05Instructions() const { bool HasTcgen05 = false; unsigned MinPTXVersion = 86; switch (FullSmVersion) { default: break; case 1003: // sm_100a case 1013: // sm_101a HasTcgen05 = true; break; case 1103: // sm_110a HasTcgen05 = true; MinPTXVersion = 90; break; case 1033: // sm_103a HasTcgen05 = true; MinPTXVersion = 88; break; } return HasTcgen05 && PTXVersion >= MinPTXVersion; } bool hasTcgen05MMAScaleInputDImm() const { return FullSmVersion == 1003 && PTXVersion >= 86; } // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { // TODO: Update/tidy-up after the family-conditional support arrives switch (FullSmVersion) { case 1003: case 1013: return PTXVersion >= 86; case 1033: return PTXVersion >= 88; default: return false; } } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the // block that's lexically below it. This would lead to a phantom CFG edges // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when // PTX ISA versions 8.3+ we can confidently say that the bug will not be // present. bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have architecture-accelerated features that are // supported on the specified architecture only, hence such targets do not // follow the onion layer model. hasArchAccelFeatures() allows distinguishing // such GPU variants from the base GPU architecture. // - false represents non-accelerated architecture. // - true represents architecture-accelerated variant. bool hasArchAccelFeatures() const { return (getFullSmVersion() & 1) && PTXVersion >= 80; } // GPUs with 'f' suffix have architecture-accelerated features which are // portable across all future architectures under same SM major. For example, // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. // - false represents non-family-specific architecture. // - true represents family-specific variant. bool hasFamilySpecificFeatures() const { return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 : hasArchAccelFeatures(); } // If the user did not provide a target we default to the `sm_30` target. std::string getTargetName() const { return TargetName.empty() ? "sm_30" : TargetName; } bool hasTargetName() const { return !TargetName.empty(); } bool hasNativeBF16Support(int Opcode) const; // Get maximum value of required alignments among the supported data types. // From the PTX ISA doc, section 8.2.3: // The memory consistency model relates operations executed on memory // locations with scalar data-types, which have a maximum size and alignment // of 64 bits. Memory operations with a vector data-type are modelled as a // set of equivalent memory operations with a scalar data-type, executed in // an unspecified order on the elements in the vector. unsigned getMaxRequiredAlignment() const { return 8; } // Get the smallest cmpxchg word size that the hardware supports. unsigned getMinCmpXchgSizeInBits() const { return 32; } unsigned getPTXVersion() const { return PTXVersion; } NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); void failIfClustersUnsupported(std::string const &FailureMessage) const; }; } // End llvm namespace #endif