Diffstat (limited to 'llvm/lib/Target/AMDGPU')
7 files changed, 58 insertions, 10 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9ce1224..aed325c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 
 bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
   const DebugLoc &DL = I.getDebugLoc();
   MachineBasicBlock *BB = I.getParent();
+  Register VCCReg = I.getOperand(1).getReg();
+  MachineInstr *Cmp;
+
+  if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    unsigned CmpOpc =
+        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
+  } else {
+    // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64
+    // which sets SCC as a side effect.
+    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
+              .addReg(VCCReg)
+              .addReg(VCCReg);
+  }
-  unsigned CmpOpc =
-      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
-  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
-                          .addReg(I.getOperand(1).getReg())
-                          .addImm(0);
 
   if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 5407566..b84c30e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
+  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
+  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
+  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
+  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
+  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
+                          {ResLo.getReg(0), ResHi.getReg(0)});
+  MI.eraseFromParent();
+}
+
 static bool isSignedBFE(MachineInstr &MI) {
   if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
     return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -804,6 +814,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     }
     break;
   }
+  case UnpackAExt:
+    return lowerUnpackAExt(MI);
   case WidenMMOToS32:
     return widenMMOToS32(cast<GAnyLoad>(MI));
   }
@@ -1120,7 +1132,8 @@ void RegBankLegalizeHelper::applyMappingDst(
       assert(RB == SgprRB);
       Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
       Op.setReg(NewDst);
-      B.buildTrunc(Reg, NewDst);
+      if (!MRI.use_empty(Reg))
+        B.buildTrunc(Reg, NewDst);
       break;
     }
     case InvalidMapping: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d937815..ad3ff1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -124,6 +124,7 @@ private:
   void lowerSplitTo32Select(MachineInstr &MI);
   void lowerSplitTo32SExtInReg(MachineInstr &MI);
   void lowerUnpackMinMax(MachineInstr &MI);
+  void lowerUnpackAExt(MachineInstr &MI);
 };
 
 } // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a67b12a..01abd35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
       .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
       .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
-      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt})
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
+
+  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_UADDE, G_USUBE}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
 
   addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efd..030bd75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -223,7 +223,8 @@ enum LoweringMethodID {
   UniCstExt,
   SplitLoad,
   WidenLoad,
-  WidenMMOToS32
+  WidenMMOToS32,
+  UnpackAExt
 };
 
 enum FastRulesTypes {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac..b28c50e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() {
       isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  if (EnableUniformIntrinsicCombine)
+    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
+
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
 
+  if (EnableUniformIntrinsicCombine)
+    addPass(AMDGPUUniformIntrinsicCombinePass());
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b34ab2a..8bb2808 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
   SDLoc SL(N);
 
   if (Src.getOpcode() == ISD::SETCC) {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    // Need to expand bfloat to float for comparison (setcc).
+    if (Op0.getValueType() == MVT::bf16) {
+      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+    }
     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
-    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
-                       Src.getOperand(1), Src.getOperand(2));
+    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
   }
   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
     // (ballot 0) -> 0