diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 20 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 21 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 37 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 27 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 8 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 25 | 
9 files changed, 142 insertions, 14 deletions
| diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9ce1224..aed325c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {  bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {    const DebugLoc &DL = I.getDebugLoc();    MachineBasicBlock *BB = I.getParent(); +  Register VCCReg = I.getOperand(1).getReg(); +  MachineInstr *Cmp; + +  if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { +    unsigned CmpOpc = +        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; +    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); +  } else { +    // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 +    // which sets SCC as a side effect. +    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); +    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) +              .addReg(VCCReg) +              .addReg(VCCReg); +  } -  unsigned CmpOpc = -      STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; -  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) -                          .addReg(I.getOperand(1).getReg()) -                          .addImm(0);    if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))      return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..907f830 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@  #include "llvm/CodeGen/GlobalISel/CSEInfo.h"  #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"  #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"  #include "llvm/CodeGen/GlobalISel/Utils.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@  using namespace llvm;  using namespace AMDGPU; +using namespace llvm::MIPatternMatch;  namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { +  return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} +  class AMDGPURegBankLegalize : public MachineFunctionPass {  public:    static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {  Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {    // Src = G_AMDGPU_READANYLANE RALSrc -  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); -  if (RAL) +  Register RALSrc; +  if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))      return RALSrc; +  // TruncSrc = G_AMDGPU_READANYLANE RALSrc +  // AextSrc = G_TRUNC TruncSrc +  // Src = G_ANYEXT AextSrc +  if (mi_match(Src, MRI, +               m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { +    return RALSrc; +  } +    // LoVgpr, 
HiVgpr = G_UNMERGE_VALUES UnmergeSrc    // LoSgpr = G_AMDGPU_READANYLANE LoVgpr    // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..dc8fa7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {    MI.eraseFromParent();  } +void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { +  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); +  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); +  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); +  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); +  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), +                          {ResLo.getReg(0), ResHi.getReg(0)}); +  MI.eraseFromParent(); +} +  static bool isSignedBFE(MachineInstr &MI) {    if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))      return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -616,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {    MI.eraseFromParent();  } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { +  Register Dst = MI.getOperand(0).getReg(); +  assert(MRI.getType(Dst) == V2S16); +  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); +  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); +  unsigned Opc = MI.getOpcode(); +  auto Flags = MI.getFlags(); +  auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); +  auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); +  auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); +  auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); +  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); +  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); +  B.buildMergeLikeInstr(Dst, {Lo, Hi}); +  
MI.eraseFromParent(); +} +  void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {    Register Dst = MI.getOperand(0).getReg();    LLT DstTy = MRI.getType(Dst); @@ -688,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,      return lowerUnpackBitShift(MI);    case UnpackMinMax:      return lowerUnpackMinMax(MI); +  case ScalarizeToS16: +    return lowerSplitTo16(MI);    case Ext32To64: {      const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());      MachineInstrBuilder Hi; @@ -804,6 +833,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,      }      break;    } +  case UnpackAExt: +    return lowerUnpackAExt(MI);    case WidenMMOToS32:      return widenMMOToS32(cast<GAnyLoad>(MI));    } @@ -837,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {      return LLT::scalar(32);    case Sgpr64:    case Vgpr64: +  case UniInVgprS64:      return LLT::scalar(64);    case Sgpr128:    case Vgpr128: @@ -960,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {    case UniInVcc:    case UniInVgprS16:    case UniInVgprS32: +  case UniInVgprS64:    case UniInVgprV2S16:    case UniInVgprV4S32:    case UniInVgprB32: @@ -1092,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(        break;      }      case UniInVgprS32: +    case UniInVgprS64:      case UniInVgprV2S16:      case UniInVgprV4S32: {        assert(Ty == getTyFromID(MethodIDs[OpIdx])); @@ -1120,7 +1154,8 @@ void RegBankLegalizeHelper::applyMappingDst(        assert(RB == SgprRB);        Register NewDst = MRI.createVirtualRegister(SgprRB_S32);        Op.setReg(NewDst); -      B.buildTrunc(Reg, NewDst); +      if (!MRI.use_empty(Reg)) +        B.buildTrunc(Reg, NewDst);        break;      }      case InvalidMapping: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..e7598f8 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper {    static constexpr LLT P6 = LLT::pointer(6, 32);    MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; +  MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};    MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};    MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,9 +122,11 @@ private:    void lowerV_BFE(MachineInstr &MI);    void lowerS_BFE(MachineInstr &MI);    void lowerSplitTo32(MachineInstr &MI); +  void lowerSplitTo16(MachineInstr &MI);    void lowerSplitTo32Select(MachineInstr &MI);    void lowerSplitTo32SExtInReg(MachineInstr &MI);    void lowerUnpackMinMax(MachineInstr &MI); +  void lowerUnpackAExt(MachineInstr &MI);  };  } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..b22e9bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,        .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})        .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})        .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) -      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); +      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) +      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) +      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) +      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) +      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + +  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) +      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) +      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + +  addRulesForGOpcs({G_UADDE, G_USUBE}, Standard) +      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) +   
   .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});    addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -906,9 +918,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,    bool hasSALUFloat = ST->hasSALUFloatInsts();    addRulesForGOpcs({G_FADD}, Standard) +      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) +      .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) +      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})        .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)        .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) -      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); +      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) +      .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) +      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) +      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) +      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, +           hasSALUFloat) +      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) +      .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) +      .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});    addRulesForGOpcs({G_FPTOUI})        .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..e6df5d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {    V4S32,    UniV2S16, +  UniV2S32,    DivV2S16, +  DivV2S32,    // B types    B32, @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {    UniInVcc,    UniInVgprS16,    UniInVgprS32, +  UniInVgprS64,    UniInVgprV2S16, +  UniInVgprV2S32,    UniInVgprV4S32,    UniInVgprB32,    UniInVgprB64, @@ -217,13 +221,15 @@ enum LoweringMethodID {    V_BFE,    VgprToVccCopy,    SplitTo32, +  ScalarizeToS16,    
SplitTo32Select,    SplitTo32SExtInReg,    Ext32To64,    UniCstExt,    SplitLoad,    WidenLoad, -  WidenMMOToS32 +  WidenMMOToS32, +  UnpackAExt  };  enum FastRulesTypes { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75a94ac..b28c50e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() {        isPassEnabled(EnableImageIntrinsicOptimizer))      addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); +  if (EnableUniformIntrinsicCombine) +    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); +    // This can be disabled by passing ::Disable here or on the command line    // with --expand-variadics-override=disable.    addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {    if (isPassEnabled(EnableImageIntrinsicOptimizer))      addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); +  if (EnableUniformIntrinsicCombine) +    addPass(AMDGPUUniformIntrinsicCombinePass());    // This can be disabled by passing ::Disable here or on the command line    // with --expand-variadics-override=disable.    addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b34ab2a..8bb2808 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,    SDLoc SL(N);    if (Src.getOpcode() == ISD::SETCC) { +    SDValue Op0 = Src.getOperand(0); +    SDValue Op1 = Src.getOperand(1); +    // Need to expand bfloat to float for comparison (setcc). 
+    if (Op0.getValueType() == MVT::bf16) { +      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); +      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); +    }      // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) -    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), -                       Src.getOperand(1), Src.getOperand(2)); +    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));    }    if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {      // (ballot 0) -> 0 diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index d80a6f3..a6c1af2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore(        }      } +    Register FinalValueReg = ValueReg; +    if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { +      // If we are loading a 16-bit value with SRAMECC enabled we need a temp +      // 32-bit VGPR to load and extract 16-bits into the final register. +      ValueReg = +          RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); +      SubReg = ValueReg; +      IsKill = false; +    } +      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);      MachineMemOperand *NewMMO =          MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, @@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore(        MIB.addImm(0); // swz      MIB.addMemOperand(NewMMO); +    if (FinalValueReg != ValueReg) { +      // Extract 16 bits from the loaded 32-bit value. 
+      ValueReg = getSubReg(ValueReg, AMDGPU::lo16); +      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) +                .addReg(FinalValueReg, getDefRegState(true)) +                .addImm(0) +                .addReg(ValueReg, getKillRegState(true)) +                .addImm(0); +      ValueReg = FinalValueReg; +    } +      if (!IsAGPR && NeedSuperRegDef)        MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,        unsigned Opc;        if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {          assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); -        Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; +        Opc = ST.d16PreservesUnusedBits() +                  ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 +                  : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;        } else {          Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE                    ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR | 
