Diffstat (limited to 'llvm/lib/Target')
30 files changed, 404 insertions, 51 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9ce1224..aed325c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
   const DebugLoc &DL = I.getDebugLoc();
   MachineBasicBlock *BB = I.getParent();
+  Register VCCReg = I.getOperand(1).getReg();
+  MachineInstr *Cmp;
+
+  if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    unsigned CmpOpc =
+        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
+  } else {
+    // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64
+    // which sets SCC as a side effect.
+    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
+              .addReg(VCCReg)
+              .addReg(VCCReg);
+  }
-  unsigned CmpOpc =
-      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
-  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
-                          .addReg(I.getOperand(1).getReg())
-                          .addImm(0);
   if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 5407566..b84c30e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
   MI.eraseFromParent();
 }
+void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
+  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
+  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
+  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
+  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
+  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
+                          {ResLo.getReg(0), ResHi.getReg(0)});
+  MI.eraseFromParent();
+}
+
 static bool isSignedBFE(MachineInstr &MI) {
   if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
     return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -804,6 +814,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     }
     break;
   }
+  case UnpackAExt:
+    return lowerUnpackAExt(MI);
   case WidenMMOToS32:
     return widenMMOToS32(cast<GAnyLoad>(MI));
   }
@@ -1120,7 +1132,8 @@ void RegBankLegalizeHelper::applyMappingDst(
       assert(RB == SgprRB);
       Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
       Op.setReg(NewDst);
-      B.buildTrunc(Reg, NewDst);
+      if (!MRI.use_empty(Reg))
+        B.buildTrunc(Reg, NewDst);
       break;
     }
     case InvalidMapping: {
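
A minimal scalar model of the lowerUnpackAExt() lowering above, assuming a uniform <2 x s16> add held in one 32-bit SGPR (illustrative sketch only, not code from the patch): the two 16-bit halves are any-extended into 32-bit values, operated on separately, and repacked by the build_vector_trunc.

// Hypothetical C++ model; names are illustrative only.
#include <cstdint>
uint32_t unpackAExtAddV2S16(uint32_t A, uint32_t B) {
  uint32_t ALo = A & 0xffffu, AHi = A >> 16; // unpackAExt of operand 1
  uint32_t BLo = B & 0xffffu, BHi = B >> 16; // unpackAExt of operand 2
  uint32_t ResLo = ALo + BLo;                // 32-bit op on the low half
  uint32_t ResHi = AHi + BHi;                // 32-bit op on the high half
  return (ResLo & 0xffffu) | (ResHi << 16);  // buildBuildVectorTrunc repack
}
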
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d937815..ad3ff1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -124,6 +124,7 @@ private:
   void lowerSplitTo32Select(MachineInstr &MI);
   void lowerSplitTo32SExtInReg(MachineInstr &MI);
   void lowerUnpackMinMax(MachineInstr &MI);
+  void lowerUnpackAExt(MachineInstr &MI);
 };

 } // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a67b12a..01abd35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
       .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
       .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
-      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt})
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
+
+  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_UADDE, G_USUBE}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});

   addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efd..030bd75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -223,7 +223,8 @@ enum LoweringMethodID {
   UniCstExt,
   SplitLoad,
   WidenLoad,
-  WidenMMOToS32
+  WidenMMOToS32,
+  UnpackAExt
 };

 enum FastRulesTypes {
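
For the new G_UADDO/G_USUBO and G_UADDE/G_USUBE rules above, a rough scalar model of the uniform (SGPR) case, where the carry-out is a 32-bit boolean (Sgpr32Trunc) and the incoming carry is any-extended so only bit 0 may be trusted (Sgpr32AExtBoolInReg). This is an illustrative sketch, not the lowering itself.

// Illustrative only: 32-bit unsigned add with carry-in/carry-out.
#include <cstdint>
#include <utility>
std::pair<uint32_t, uint32_t> uadde32(uint32_t A, uint32_t B, uint32_t CarryIn) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn & 1u); // mask carry-in to bit 0
  return {uint32_t(Wide), uint32_t(Wide >> 32)};              // {sum, carry-out}
}
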
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac..b28c50e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() {
       isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

+  if (EnableUniformIntrinsicCombine)
+    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
+
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

+  if (EnableUniformIntrinsicCombine)
+    addPass(AMDGPUUniformIntrinsicCombinePass());
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b34ab2a..8bb2808 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
   SDLoc SL(N);

   if (Src.getOpcode() == ISD::SETCC) {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    // Need to expand bfloat to float for comparison (setcc).
+    if (Op0.getValueType() == MVT::bf16) {
+      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+    }
     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
-    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
-                       Src.getOperand(1), Src.getOperand(2));
+    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
   }

   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
     // (ballot 0) -> 0
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a4d3d62..6b06534 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -22109,6 +22109,11 @@ bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
           ScalarTy->isIntegerTy(32));
 }

+ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters() const {
+  static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
+  return RCRegs;
+}
+
 Value *ARMTargetLowering::createComplexDeinterleavingIR(
     IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 357d2c5..bf3438b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -1009,6 +1009,8 @@ class VectorType;

     bool isUnsupportedFloatingType(EVT VT) const;

+    ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+
     SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
                     SDValue ARMcc, SDValue Flags, SelectionDAG &DAG) const;
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
index 3b810d0..79863e1 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -34,7 +34,7 @@ class HexagonCopyHoisting : public MachineFunctionPass {
 public:
   static char ID;

-  HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {}
+  HexagonCopyHoisting() : MachineFunctionPass(ID) {}

   StringRef getPassName() const override { return "Hexagon Copy Hoisting"; }

@@ -56,8 +56,8 @@ public:
   void moveCopyInstr(MachineBasicBlock *DestBB,
                      std::pair<Register, Register> Key, MachineInstr *MI);

-  MachineFunction *MFN;
-  MachineRegisterInfo *MRI;
+  MachineFunction *MFN = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
   std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>>
       CopyMIList;
 };
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
index 93418f7..a10c937 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -34,13 +34,13 @@ STATISTIC(HexagonNumStoreAbsConversions,
 namespace {

 class HexagonGenMemAbsolute : public MachineFunctionPass {
-  const HexagonInstrInfo *TII;
-  MachineRegisterInfo *MRI;
-  const TargetRegisterInfo *TRI;
+  const HexagonInstrInfo *TII = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;

 public:
   static char ID;
-  HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {}
+  HexagonGenMemAbsolute() : MachineFunctionPass(ID) {}

   StringRef getPassName() const override {
     return "Hexagon Generate Load/Store Set Absolute Address Instruction";
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index 1637b91..d19920c 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -612,6 +612,9 @@ let Predicates = [UseHVX] in {
            (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>;
   def: Pat<(VecQ32 (trunc HVI32:$Vs)),
            (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>;
+  def: Pat<(VecQ16 (trunc HWI32:$Vss)),
+           (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))),
+           (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>;
 }

 let Predicates = [UseHVX] in {
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index b9cdd6a..ce2de75 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -544,7 +544,7 @@ int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
   if (!hasV60Ops())
     return Latency;

-  auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
+  const HexagonInstrInfo &QII = *getInstrInfo();
   // BSB scheduling.
   if (QII.isHVXVec(SrcInst) || useBSBScheduling())
     Latency = (Latency + 1) >> 1;
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
index 71bdfc66..5a85f34 100644
--- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -43,7 +43,7 @@ namespace {
 class HexagonTfrCleanup : public MachineFunctionPass {
 public:
   static char ID;
-  HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {}
+  HexagonTfrCleanup() : MachineFunctionPass(ID) {}
   StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
@@ -52,8 +52,8 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;

 private:
-  const HexagonInstrInfo *HII;
-  const TargetRegisterInfo *TRI;
+  const HexagonInstrInfo *HII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;

   typedef DenseMap<unsigned, uint64_t> ImmediateMap;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 690dd73..e86b21c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))),
 // FP Rounding
 let Predicates = [HasBasicF, IsLA64] in {
 def : PatFpr<frint, FRINT_S, FPR32>;
+def : PatFpr<flog2, FLOGB_S, FPR32>;
 } // Predicates = [HasBasicF, IsLA64]

 let Predicates = [HasBasicF, IsLA32] in {
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index daefbaa..2e88254 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>;
 // FP Rounding
 let Predicates = [HasBasicD, IsLA64] in {
 def : PatFpr<frint, FRINT_D, FPR64>;
+def : PatFpr<flog2, FLOGB_D, FPR64>;
 } // Predicates = [HasBasicD, IsLA64]

 /// Pseudo-instructions needed for the soft-float ABI with LA32D
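
With the FLOGB patterns above and ISD::FLOG2 made legal in the LoongArchISelLowering.cpp change below, a base-2 logarithm should be selectable as flogb.s / flogb.d on LA64 with basic FP instead of a log2f / log2 libcall. An assumed example (requires the call to be lowered to the llvm.log2 intrinsic, e.g. with -fno-math-errno):

// Illustrative only; expected single-instruction selection on LA64.
#include <cmath>
float  flog2_f32(float x)  { return std::log2(x); } // expected: flogb.s
double flog2_f64(double x) { return std::log2(x); } // expected: flogb.d
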
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 80c96c6..a6de839 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_BF16, MVT::f32,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);

-    if (Subtarget.is64Bit())
+    if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FRINT, MVT::f32, Legal);
+      setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+    }

     if (!Subtarget.hasBasicD()) {
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
@@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_BF16, MVT::f64,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);

-    if (Subtarget.is64Bit())
+    if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FLOG2, MVT::f64, Legal);
+    }
   }

   // Set operations for 'LSX' feature.
@@ -362,6 +366,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Legal);
+      setOperationAction(ISD::FLOG2, VT, Legal);
       setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
@@ -443,6 +448,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Legal);
+      setOperationAction(ISD::FLOG2, VT, Legal);
       setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 613dea6..ca4ee5f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1593,6 +1593,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa),
 // XVFSQRT_{S/D}
 defm : PatXrF<fsqrt, "XVFSQRT">;

+// XVFLOGB_{S/D}
+defm : PatXrF<flog2, "XVFLOGB">;
+
 // XVRECIP_{S/D}
 def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj),
           (XVFRECIP_S v8f32:$xj)>;
@@ -2024,6 +2027,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
                (XVFTINTRZ_LU_D v4f64:$vj)),
               sub_128)>;

+// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU}
+defm : VAvgPat<sra, "XVAVG_B", v32i8>;
+defm : VAvgPat<sra, "XVAVG_H", v16i16>;
+defm : VAvgPat<sra, "XVAVG_W", v8i32>;
+defm : VAvgPat<sra, "XVAVG_D", v4i64>;
+defm : VAvgPat<srl, "XVAVG_BU", v32i8>;
+defm : VAvgPat<srl, "XVAVG_HU", v16i16>;
+defm : VAvgPat<srl, "XVAVG_WU", v8i32>;
+defm : VAvgPat<srl, "XVAVG_DU", v4i64>;
+defm : VAvgrPat<sra, "XVAVGR_B", v32i8>;
+defm : VAvgrPat<sra, "XVAVGR_H", v16i16>;
+defm : VAvgrPat<sra, "XVAVGR_W", v8i32>;
+defm : VAvgrPat<sra, "XVAVGR_D", v4i64>;
+defm : VAvgrPat<srl, "XVAVGR_BU", v32i8>;
+defm : VAvgrPat<srl, "XVAVGR_HU", v16i16>;
+defm : VAvgrPat<srl, "XVAVGR_WU", v8i32>;
+defm : VAvgrPat<srl, "XVAVGR_DU", v4i64>;
+
 // abs
 def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>;
 def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 4619c6b..92402ba 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
   }
 }

+multiclass VAvgPat<SDPatternOperator OpNode, string Inst, ValueType vt> {
+  def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))),
+            (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>;
+}
+
+multiclass VAvgrPat<SDPatternOperator OpNode, string Inst, ValueType vt> {
+  def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)),
+                             (vt (vsplat_imm_eq_1)))),
+                    (vt (vsplat_imm_eq_1))),
+            (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>;
+}
+
 let Predicates = [HasExtLSX] in {

 // VADD_{B/H/W/D}
@@ -1783,6 +1795,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va),
 // VFSQRT_{S/D}
 defm : PatVrF<fsqrt, "VFSQRT">;

+// VFLOGB_{S/D}
+defm : PatVrF<flog2, "VFLOGB">;
+
 // VFRECIP_{S/D}
 def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj),
           (VFRECIP_S v4f32:$vj)>;
@@ -2154,6 +2169,24 @@ def : Pat<(f32 f32imm_vldi:$in),
 def : Pat<(f64 f64imm_vldi:$in),
           (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>;

+// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU}
+defm : VAvgPat<sra, "VAVG_B", v16i8>;
+defm : VAvgPat<sra, "VAVG_H", v8i16>;
+defm : VAvgPat<sra, "VAVG_W", v4i32>;
+defm : VAvgPat<sra, "VAVG_D", v2i64>;
+defm : VAvgPat<srl, "VAVG_BU", v16i8>;
+defm : VAvgPat<srl, "VAVG_HU", v8i16>;
+defm : VAvgPat<srl, "VAVG_WU", v4i32>;
+defm : VAvgPat<srl, "VAVG_DU", v2i64>;
+defm : VAvgrPat<sra, "VAVGR_B", v16i8>;
+defm : VAvgrPat<sra, "VAVGR_H", v8i16>;
+defm : VAvgrPat<sra, "VAVGR_W", v4i32>;
+defm : VAvgrPat<sra, "VAVGR_D", v2i64>;
+defm : VAvgrPat<srl, "VAVGR_BU", v16i8>;
+defm : VAvgrPat<srl, "VAVGR_HU", v8i16>;
+defm : VAvgrPat<srl, "VAVGR_WU", v4i32>;
+defm : VAvgrPat<srl, "VAVGR_DU", v2i64>;
+
 // abs
 def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>;
 def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>;
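
The VAvgPat/VAvgrPat multiclasses above match the usual per-element averaging idioms (sra for the signed forms, srl for the unsigned ones). A worked scalar model of what one element computes, not code from the patch:

// Per-element semantics matched by the patterns (signed 32-bit shown); the
// DAG add is a wrapping vector add, the shift-by-one comes from the splat-1.
#include <cstdint>
int32_t vavg_w(int32_t a, int32_t b)  { return (a + b) >> 1; }     // VAVG.W / XVAVG.W
int32_t vavgr_w(int32_t a, int32_t b) { return (a + b + 1) >> 1; } // VAVGR.W / XVAVGR.W
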
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 000d296..4ff489d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
                                    std::optional<Reloc::Model> RM,
                                    std::optional<CodeModel::Model> CM,
                                    CodeGenOptLevel OL, bool JIT)
-    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU,
-                               computeFSAdditions(FS, OL, TT), Options,
+    : CodeGenTargetMachineImpl(T,
+                               TT.computeDataLayout(Options.MCOptions.ABIName),
+                               TT, CPU, computeFSAdditions(FS, OL, TT), Options,
                                getEffectiveRelocModel(TT, RM),
                                getEffectivePPCCodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())),
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 8198173..282cf5d 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -92,6 +92,10 @@ private:
   void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
                  MachineIRBuilder &MIB) const;
   bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
+  void addVectorLoadStoreOperands(MachineInstr &I,
+                                  SmallVectorImpl<SrcOp> &SrcOps,
+                                  unsigned &CurOp, bool IsMasked,
+                                  bool IsStrided) const;
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineIRBuilder &MIB) const;

@@ -716,6 +720,26 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
   return GenericOpc;
 }

+void RISCVInstructionSelector::addVectorLoadStoreOperands(
+    MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp,
+    bool IsMasked, bool IsStrided) const {
+  // Base Pointer
+  auto PtrReg = I.getOperand(CurOp++).getReg();
+  SrcOps.push_back(PtrReg);
+
+  // Stride
+  if (IsStrided) {
+    auto StrideReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(StrideReg);
+  }
+
+  // Mask
+  if (IsMasked) {
+    auto MaskReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(MaskReg);
+  }
+}
+
 bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineIRBuilder &MIB) const {
   // Find the intrinsic ID.
@@ -752,21 +776,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
       SrcOps.push_back(Register(RISCV::NoRegister));
     }

-    // Base Pointer
-    auto PtrReg = I.getOperand(CurOp++).getReg();
-    SrcOps.push_back(PtrReg);
-
-    // Stride
-    if (IsStrided) {
-      auto StrideReg = I.getOperand(CurOp++).getReg();
-      SrcOps.push_back(StrideReg);
-    }
-
-    // Mask
-    if (IsMasked) {
-      auto MaskReg = I.getOperand(CurOp++).getReg();
-      SrcOps.push_back(MaskReg);
-    }
+    addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided);

     RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
     const RISCV::VLEPseudo *P =
@@ -795,6 +805,48 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
     I.eraseFromParent();
     return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
   }
+  case Intrinsic::riscv_vsm:
+  case Intrinsic::riscv_vse:
+  case Intrinsic::riscv_vse_mask:
+  case Intrinsic::riscv_vsse:
+  case Intrinsic::riscv_vsse_mask: {
+    bool IsMasked = IntrinID == Intrinsic::riscv_vse_mask ||
+                    IntrinID == Intrinsic::riscv_vsse_mask;
+    bool IsStrided = IntrinID == Intrinsic::riscv_vsse ||
+                     IntrinID == Intrinsic::riscv_vsse_mask;
+    LLT VT = MRI->getType(I.getOperand(1).getReg());
+    unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+    // Sources
+    unsigned CurOp = 1;
+    SmallVector<SrcOp, 4> SrcOps; // Source registers.
+
+    // Store value
+    auto PassthruReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(PassthruReg);
+
+    addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided);
+
+    RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
+    const RISCV::VSEPseudo *P = RISCV::getVSEPseudo(
+        IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL));
+
+    auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps);
+
+    // Select VL
+    auto VLOpFn = renderVLOp(I.getOperand(CurOp++));
+    for (auto &RenderFn : *VLOpFn)
+      RenderFn(PseudoMI);
+
+    // SEW
+    PseudoMI.addImm(Log2SEW);
+
+    // Memref
+    PseudoMI.cloneMemRefs(I);
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
+  }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 4105618..526675a 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -127,6 +127,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoCCAND:
   case RISCV::PseudoCCOR:
   case RISCV::PseudoCCXOR:
+  case RISCV::PseudoCCMAX:
+  case RISCV::PseudoCCMAXU:
+  case RISCV::PseudoCCMIN:
+  case RISCV::PseudoCCMINU:
   case RISCV::PseudoCCADDW:
   case RISCV::PseudoCCSUBW:
   case RISCV::PseudoCCSLL:
@@ -217,6 +221,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
         .addImm(0);
   } else {
     unsigned NewOpc;
+    // clang-format off
     switch (MI.getOpcode()) {
     default:
       llvm_unreachable("Unexpected opcode!");
@@ -228,6 +233,10 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCAND:   NewOpc = RISCV::AND;   break;
     case RISCV::PseudoCCOR:    NewOpc = RISCV::OR;    break;
     case RISCV::PseudoCCXOR:   NewOpc = RISCV::XOR;   break;
+    case RISCV::PseudoCCMAX:   NewOpc = RISCV::MAX;   break;
+    case RISCV::PseudoCCMIN:   NewOpc = RISCV::MIN;   break;
+    case RISCV::PseudoCCMAXU:  NewOpc = RISCV::MAXU;  break;
+    case RISCV::PseudoCCMINU:  NewOpc = RISCV::MINU;  break;
    case RISCV::PseudoCCADDI:  NewOpc = RISCV::ADDI;  break;
     case RISCV::PseudoCCSLLI:  NewOpc = RISCV::SLLI;  break;
     case RISCV::PseudoCCSRLI:  NewOpc = RISCV::SRLI;  break;
@@ -250,6 +259,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break;
     case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break;
     }
+    // clang-format on

     if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) {
       BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index b4556f6..cfee6ab 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1851,6 +1851,11 @@ def TuneShortForwardBranchOpt
 def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">;
 def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">;

+def TuneShortForwardBranchIMinMax
+    : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax",
+                       "true", "Enable short forward branch optimization for min,max instructions in Zbb",
+                       [TuneShortForwardBranchOpt]>;
+
 // Some subtargets require a S2V transfer buffer to move scalars into vectors.
 // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
 def TuneNoSinkSplatOperands
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9a6afa1..b25a054 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3995,6 +3995,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,
     case RISCV::CTZW:
     case RISCV::CPOPW:
     case RISCV::SLLI_UW:
+    case RISCV::ABSW:
     case RISCV::FMV_W_X:
     case RISCV::FCVT_H_W:
     case RISCV::FCVT_H_W_INX:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1c930ac..56881f7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -433,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasStdExtP() ||
       (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
     setOperationAction(ISD::ABS, XLenVT, Legal);
+    if (Subtarget.is64Bit())
+      setOperationAction(ISD::ABS, MVT::i32, Custom);
   } else if (Subtarget.hasShortForwardBranchOpt()) {
     // We can use PseudoCCSUB to implement ABS.
     setOperationAction(ISD::ABS, XLenVT, Legal);
@@ -14816,8 +14818,16 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
+    if (Subtarget.hasStdExtP()) {
+      SDValue Src =
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+      SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src);
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs));
+      return;
+    }
+
     if (Subtarget.hasStdExtZbb()) {
-      // Emit a special ABSW node that will be expanded to NEGW+MAX at isel.
+      // Emit a special node that will be expanded to NEGW+MAX at isel.
       // This allows us to remember that the result is sign extended. Expanding
       // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits.
       SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64,
@@ -20290,6 +20300,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
+  case RISCVISD::ABSW:
   case RISCVISD::CLZW:
   case RISCVISD::CTZW: {
     // Only the lower 32 bits of the first operand are read
@@ -21862,6 +21873,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
   case RISCVISD::REMUW:
   case RISCVISD::ROLW:
   case RISCVISD::RORW:
+  case RISCVISD::ABSW:
   case RISCVISD::FCVT_W_RV64:
   case RISCVISD::FCVT_WU_RV64:
   case RISCVISD::STRICT_FCVT_W_RV64:
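
With the new RISCVISD::ABSW node and the HasStdExtP custom legalisation above, a 32-bit absolute value on rv64 can presumably be emitted as a single sign-extending ABSW instead of the NEGW+MAX sequence. An assumed source-level example of code that would hit this path:

// Illustrative only: i32 ABS on rv64 with the P extension is widened to
// RISCVISD::ABSW (only the low 32 bits of the input are read, the result is
// sign-extended) and then matched by the new PatGpr<riscv_absw, ABSW> pattern.
#include <cstdint>
int32_t iabs32(int32_t x) { return x < 0 ? -x : x; }
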
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 912b82d..3a7013d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1699,6 +1699,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
   case RISCV::AND:   return RISCV::PseudoCCAND;
   case RISCV::OR:    return RISCV::PseudoCCOR;
   case RISCV::XOR:   return RISCV::PseudoCCXOR;
+  case RISCV::MAX:   return RISCV::PseudoCCMAX;
+  case RISCV::MAXU:  return RISCV::PseudoCCMAXU;
+  case RISCV::MIN:   return RISCV::PseudoCCMIN;
+  case RISCV::MINU:  return RISCV::PseudoCCMINU;
   case RISCV::ADDI:  return RISCV::PseudoCCADDI;
   case RISCV::SLLI:  return RISCV::PseudoCCSLLI;
@@ -1735,7 +1739,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
 /// return the defining instruction.
 static MachineInstr *canFoldAsPredicatedOp(Register Reg,
                                            const MachineRegisterInfo &MRI,
-                                           const TargetInstrInfo *TII) {
+                                           const TargetInstrInfo *TII,
+                                           const RISCVSubtarget &STI) {
   if (!Reg.isVirtual())
     return nullptr;
   if (!MRI.hasOneNonDBGUse(Reg))
@@ -1743,6 +1748,12 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
   MachineInstr *MI = MRI.getVRegDef(Reg);
   if (!MI)
     return nullptr;
+
+  if (!STI.hasShortForwardBranchIMinMax() &&
+      (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN ||
+       MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU))
+    return nullptr;
+
   // Check if MI can be predicated and folded into the CCMOV.
   if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END)
     return nullptr;
@@ -1806,10 +1817,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI =
-      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this);
+      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI);
   bool Invert = !DefMI;
   if (!DefMI)
-    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this);
+    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI);
   if (!DefMI)
     return nullptr;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index cc085bb..4cbbba3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1461,5 +1461,10 @@ let Predicates = [HasStdExtP, IsRV32] in {
 // Codegen patterns
 //===----------------------------------------------------------------------===//

+def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>;
+
 let Predicates = [HasStdExtP] in
 def : PatGpr<abs, ABS>;
+
+let Predicates = [HasStdExtP, IsRV64] in
+def : PatGpr<riscv_absw, ABSW>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 0114fbd..5a67a5a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -106,6 +106,10 @@ def PseudoCCSRA : SFBALU_rr;
 def PseudoCCAND : SFBALU_rr;
 def PseudoCCOR  : SFBALU_rr;
 def PseudoCCXOR : SFBALU_rr;
+def PseudoCCMAX : SFBALU_rr;
+def PseudoCCMIN : SFBALU_rr;
+def PseudoCCMAXU : SFBALU_rr;
+def PseudoCCMINU : SFBALU_rr;

 def PseudoCCADDI : SFBALU_ri;
 def PseudoCCANDI : SFBALU_ri;
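
The PseudoCCMIN/MAX definitions above, gated by the short-forward-branch-i-minmax tune feature, let optimizeSelect() fold a select whose single-use operand is a Zbb min/max into one predicated op, which RISCVExpandPseudoInsts later turns back into a MIN/MAX conditionally skipped by a short forward branch. An assumed C-level example of code that can now take this path (exact codegen depends on the subtarget):

// Illustrative only: the select feeding on a single-use MIN can become
// PseudoCCMIN and expand roughly to
//   dest = b; bne cond, zero, over: min dest, a, b; over:
// instead of a full branch diamond.
#include <cstdint>
int64_t sel_min(int64_t a, int64_t b, bool cond) {
  int64_t m = a < b ? a : b; // Zbb MIN
  return cond ? m : b;
}
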
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index d08115b..ea98cdb 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -172,6 +172,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
       case RISCV::CTZW:
       case RISCV::CPOPW:
       case RISCV::SLLI_UW:
+      case RISCV::ABSW:
       case RISCV::FMV_W_X:
       case RISCV::FCVT_H_W:
       case RISCV::FCVT_H_W_INX:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 624cff2..49beada 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48778,10 +48778,9 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
       SDValue BC0 = peekThroughBitcasts(Op0);
       if (BC0.getOpcode() == X86ISD::PCMPEQ &&
           ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
-        SDLoc DL(EFLAGS);
         CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
-        SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
-        return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
+        SDValue X = DAG.getBitcast(OpVT, DAG.getFreeze(BC0.getOperand(0)));
+        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, X, X);
       }
     }
   }
@@ -48837,7 +48836,7 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
               MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
               MVT FloatVT =
                   MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
-              Res = DAG.getBitcast(FloatVT, Res);
+              Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res));
               return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
             } else if (EltBits == 16) {
               MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
@@ -48856,8 +48855,30 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
     }

     // TESTZ(X,-1) == TESTZ(X,X)
-    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+    if (ISD::isBuildVectorAllOnes(Op1.getNode())) {
+      Op0 = DAG.getFreeze(Op0);
       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+    }
+
+    // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets.
+    if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) {
+      KnownBits KnownOp1 = DAG.computeKnownBits(Op1);
+      assert(KnownOp1.getBitWidth() == 64 &&
+             "Illegal PTEST vector element width");
+      if (KnownOp1.isConstant()) {
+        const APInt &Mask = KnownOp1.getConstant();
+        if (Mask.isSignMask()) {
+          MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64);
+          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+        }
+        if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) {
+          MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32);
+          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+        }
+      }
+    }

     // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
     // TODO: Add COND_NE handling?
@@ -53480,6 +53501,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

+// Look for a RMW operation that only touches one bit of a larger than legal
+// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
+                              SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+
+  // Only handle normal stores and its chain was a matching normal load.
+  auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
+  if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
+      !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+      Ld->getBasePtr() != St->getBasePtr() ||
+      Ld->getOffset() != St->getOffset())
+    return SDValue();
+
+  SDValue LoadVal(Ld, 0);
+  SDValue StoredVal = St->getValue();
+  EVT VT = StoredVal.getValueType();
+
+  // Only narrow larger than legal scalar integers.
+  if (!VT.isScalarInteger() ||
+      VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
+    return SDValue();
+
+  // BTR: X & ~(1 << ShAmt)
+  // BTS: X | (1 << ShAmt)
+  // BTC: X ^ (1 << ShAmt)
+  SDValue ShAmt;
+  if (!StoredVal.hasOneUse() ||
+      !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+                                  m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
+        sd_match(StoredVal,
+                 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+        sd_match(StoredVal,
+                 m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+    return SDValue();
+
+  // Ensure the shift amount is in bounds.
+  KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+  if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
+    return SDValue();
+
+  // Split the shift into an alignment shift that moves the active i32 block to
+  // the bottom bits for truncation and a modulo shift that can act on the i32.
+  EVT AmtVT = ShAmt.getValueType();
+  SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+                                 DAG.getSignedConstant(-32LL, DL, AmtVT));
+  SDValue ModuloAmt =
+      DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+
+  // Compute the byte offset for the i32 block that is changed by the RMW.
+  // combineTruncate will adjust the load for us in a similar way.
+  EVT PtrVT = St->getBasePtr().getValueType();
+  SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
+  SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+                                   DAG.getShiftAmountConstant(3, PtrVT, DL));
+  SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
+                                            SDNodeFlags::NoUnsignedWrap);
+
+  // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
+  SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+  X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+
+  SDValue Mask =
+      DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
+                  DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
+  if (StoredVal.getOpcode() == ISD::AND)
+    Mask = DAG.getNOT(DL, Mask, MVT::i32);
+
+  SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+  return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+                      Align(), St->getMemOperand()->getFlags());
+}
+
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
@@ -53706,6 +53801,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     }
   }

+  if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
+    return R;
+
   // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
   //         store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
   if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -54660,8 +54758,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   // truncation, see if we can convert the shift into a pointer offset instead.
   // Limit this to normal (non-ext) scalar integer loads.
   if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
-      Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
-      ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+      Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
+      (Src.getOperand(0).hasOneUse() ||
+       !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {
     auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
     if (Ld->isSimple() && VT.isByteSized() &&
         isPowerOf2_64(VT.getSizeInBits())) {
@@ -56459,6 +56558,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   const SDValue LHS = N->getOperand(0);
   const SDValue RHS = N->getOperand(1);
@@ -56517,6 +56617,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);

+      // If we're performing a bit test on a larger than legal type, attempt
+      // to (aligned) shift down the value to the bottom 32-bits and then
+      // perform the bittest on the i32 value.
+      // ICMP_ZERO(AND(X,SHL(1,IDX)))
+      // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31))))
+      if (isNullConstant(RHS) &&
+          OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) {
+        SDValue X, ShAmt;
+        if (sd_match(LHS, m_OneUse(m_And(m_Value(X),
+                                         m_Shl(m_One(), m_Value(ShAmt)))))) {
+          // Only attempt this if the shift amount is known to be in bounds.
+          KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+          if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) {
+            EVT AmtVT = ShAmt.getValueType();
+            SDValue AlignAmt =
+                DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+                            DAG.getSignedConstant(-32LL, DL, AmtVT));
+            SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+                                            DAG.getConstant(31, DL, AmtVT));
+            SDValue Mask = DAG.getNode(
+                ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
+                DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
+            X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt);
+            X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+            X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask);
+            return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32),
+                                CC);
+          }
+        }
+      }
+
       // cmpeq(trunc(x),C) --> cmpeq(x,C)
       // cmpne(trunc(x),C) --> cmpne(x,C)
       // iff x upper bits are zero.
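
Both X86 combines above (narrowBitOpRMW and the combineSetCC bit-test narrowing) rest on the same arithmetic: for a single-bit operation on a wider-than-legal integer, the touched bit lives in the aligned 32-bit word at bit offset (idx & -32), i.e. byte offset ((idx & -32) >> 3), at position (idx & 31) inside that word. A scalar model of the store-side transform, assuming little-endian layout; illustrative only, not code from the patch:

// Model of the narrowed RMW: set one bit of a wide integer by loading,
// updating and storing only the 32-bit word that contains it (BTS case;
// BTR uses an AND with the inverted mask, BTC an XOR).
#include <cstdint>
#include <cstring>
void bts_narrow(unsigned char *wide, unsigned idx) {
  unsigned byteOfs = (idx & ~31u) >> 3;       // byte offset of the aligned word
  uint32_t w;
  std::memcpy(&w, wide + byteOfs, sizeof(w)); // i32 load
  w |= 1u << (idx & 31u);                     // modulo-32 bit within the word
  std::memcpy(wide + byteOfs, &w, sizeof(w)); // i32 store
}
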
