Diffstat (limited to 'llvm/lib/Target/AMDGPU')
20 files changed, 223 insertions, 99 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea32748..1c8383c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts
 def FeatureMadU32Inst
     : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", "true",
                        "Has v_mad_u32 instruction">;
+def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts",
+  "HasAddMinMaxInsts",
+  "true",
+  "Has v_add_{min|max}_{i|u}32 instructions"
+>;
+
+def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
+  "HasPkAddMinMaxInsts",
+  "true",
+  "Has v_pk_add_{min|max}_{i|u}16 instructions"
+>;
+
 def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
   "HasVMemToLDSLoad",
   "true",
@@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet<
   FeatureLshlAddU64Inst,
   FeatureAddSubU64Insts,
   FeatureMadU32Inst,
+  FeatureAddMinMaxInsts,
+  FeaturePkAddMinMaxInsts,
   FeatureLdsBarrierArriveAtomic,
   FeatureSetPrioIncWgInst,
   Feature45BitNumRecordsBufferResource,
@@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts :
 
 def HasAddMinMaxInsts :
   Predicate<"Subtarget->hasAddMinMaxInsts()">,
-  AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+  AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>;
 
 def HasPkAddMinMaxInsts :
   Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
-  AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+  AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>;
 
 def HasPkMinMax3Insts :
   Predicate<"Subtarget->hasPkMinMax3Insts()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8b211f..7f00ead 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[
   combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
 ]>;
 
+// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
+  [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
+   (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
+
+def combine_or_s64_s32 : GICombineRule<
+  (defs root:$dst),
+  (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
+  (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
+         (G_OR $or, $x_lo, $y),
+         (G_MERGE_VALUES $dst, $or, $x_hi))>;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
   "AMDGPUPreLegalizerCombinerImpl",
   [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
    foldable_fneg, combine_shuffle_vector_to_build_vector,
-   binop_s64_with_s32_mask_combines]> {
+   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
@@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
   [all_combines, gfx6gfx7_combines, gfx8_combines,
    combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN,
    remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq,
    fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
-   binop_s64_with_s32_mask_combines]> {
+   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
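For illustration only (not part of the commit), the new combine_or_s64_s32 rule targets IR of the following shape; the function name is hypothetical. Because the zext clears bits 63:32 of %ext, only the low half of %x participates in the OR, so the 64-bit OR can be rewritten as a 32-bit OR on the low half merged with the untouched high half:

define i64 @or_zext(i64 %x, i32 %y) {
  %ext = zext i32 %y to i64   ; high 32 bits of %ext are known zero
  %or = or i64 %x, %ext       ; only lo_32(%x) is affected
  ret i64 %or                 ; -> merge(or(lo_32(%x), %y), hi_32(%x))
}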
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1b4b113..6bad4dbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -131,7 +131,7 @@ protected:
 
 public:
   MetadataStreamerMsgPackV4() = default;
-  ~MetadataStreamerMsgPackV4() = default;
+  ~MetadataStreamerMsgPackV4() override = default;
 
   bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
 
@@ -154,7 +154,7 @@ protected:
 
 public:
   MetadataStreamerMsgPackV5() = default;
-  ~MetadataStreamerMsgPackV5() = default;
+  ~MetadataStreamerMsgPackV5() override = default;
 };
 
 class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
@@ -163,7 +163,7 @@ protected:
 
 public:
   MetadataStreamerMsgPackV6() = default;
-  ~MetadataStreamerMsgPackV6() = default;
+  ~MetadataStreamerMsgPackV6() override = default;
 
   void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF,
                        msgpack::MapDocNode Kern) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8ed4062..1b559a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                      MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
-  setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
-                     MVT::i32, Legal);
+  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
+                     Legal);
 
   setOperationAction(
       {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 596a895..1a13b22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
   }
 
+  auto &MinNumMaxNumIeee =
+      getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+  if (ST.hasVOP3PInsts()) {
+    MinNumMaxNumIeee.legalFor(FPTypesPK16)
+        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+        .clampMaxNumElements(0, S16, 2)
+        .clampScalar(0, S16, S64)
+        .scalarize(0);
+  } else if (ST.has16BitInsts()) {
+    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
+  } else {
+    MinNumMaxNumIeee.legalFor(FPTypesBase)
+        .clampScalar(0, S32, S64)
+        .scalarize(0);
+  }
+
   auto &MinNumMaxNum = getActionDefinitionsBuilder(
-      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
-       G_FMAXNUM_IEEE});
+      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
 
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
@@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         .legalFor(FPTypesPK16)
         .clampMaxNumElements(0, S16, 2)
         .scalarize(0);
+  } else if (ST.hasVOP3PInsts()) {
+    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+        .lowerFor({V2S16})
+        .clampMaxNumElementsStrict(0, S16, 2)
+        .scalarize(0)
+        .lower();
   } else {
-    // TODO: Implement
-    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+        .scalarize(0)
+        .clampScalar(0, S32, S64)
+        .lower();
   }
 
   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_FMAXNUM:
   case TargetOpcode::G_FMINIMUMNUM:
   case TargetOpcode::G_FMAXIMUMNUM:
-  case TargetOpcode::G_FMINNUM_IEEE:
-  case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, B);
@@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
   MachineFunction &MF = Helper.MIRBuilder.getMF();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
-                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
-  // With ieee_mode disabled, the instructions have the correct behavior
-  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
-  //
-  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
-  // enabled.
-  if (!MFI->getMode().IEEE) {
-    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
-        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
-      return true;
-
-    return !IsIEEEOp;
-  }
-
-  if (IsIEEEOp)
+  // With ieee_mode disabled, the instructions have the correct behavior.
+  if (!MFI->getMode().IEEE)
     return true;
 
   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
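As a hedged sketch of the G_FMINIMUM/G_FMAXIMUM change above: on subtargets without native IEEE minimum/maximum instructions, IR such as the following previously fell through to the bare .lower() marked TODO; under the new rules it should first scalarize and clamp to s32/s64 so the expansion applies per element. The function name is illustrative:

declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)

define <2 x double> @fminimum_v2f64(<2 x double> %a, <2 x double> %b) {
  ; NaN-propagating minimum, expanded from G_FMINIMUM during legalization
  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
  ret <2 x double> %r
}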
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index c5c9473..0804133 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -26,20 +26,19 @@ struct PerFunctionMIParsingState;
 class AMDGPUMIRFormatter final : public MIRFormatter {
 public:
   AMDGPUMIRFormatter() = default;
-  virtual ~AMDGPUMIRFormatter() = default;
+  ~AMDGPUMIRFormatter() override = default;
 
   /// Implement target specific printing for machine operand immediate value, so
   /// that we can have more meaningful mnemonic than a 64-bit integer. Passing
   /// None to OpIdx means the index is unknown.
-  virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
-                        std::optional<unsigned> OpIdx,
-                        int64_t Imm) const override;
+  void printImm(raw_ostream &OS, const MachineInstr &MI,
+                std::optional<unsigned> OpIdx, int64_t Imm) const override;
 
   /// Implement target specific parsing of immediate mnemonics. The mnemonic is
   /// a string with a leading dot.
-  virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
-                                StringRef Src, int64_t &Imm,
-                                ErrorCallbackType ErrorCallback) const override;
+  bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+                        StringRef Src, int64_t &Imm,
+                        ErrorCallbackType ErrorCallback) const override;
 
   /// Implement target specific parsing of target custom pseudo source value.
   bool
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a4..54ba2f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_perm_pk16_b4_u4:
     case Intrinsic::amdgcn_perm_pk16_b6_u4:
     case Intrinsic::amdgcn_perm_pk16_b8_u4:
+    case Intrinsic::amdgcn_add_max_i32:
+    case Intrinsic::amdgcn_add_max_u32:
+    case Intrinsic::amdgcn_add_min_i32:
+    case Intrinsic::amdgcn_add_min_u32:
+    case Intrinsic::amdgcn_pk_add_max_i16:
+    case Intrinsic::amdgcn_pk_add_max_u16:
+    case Intrinsic::amdgcn_pk_add_min_i16:
+    case Intrinsic::amdgcn_pk_add_min_u16:
       return getDefaultMappingVOP(MI);
     case Intrinsic::amdgcn_log:
     case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 996b55f..6214f4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -740,7 +740,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
   return "r600";
 }
 
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel() {
   // The AMDGPU toolchain only supports generating shared objects, so we
   // must always use PIC.
   return Reloc::PIC_;
@@ -754,8 +754,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOptLevel OptLevel)
     : CodeGenTargetMachineImpl(
           T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
-          getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
+          getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
+          OptLevel),
       TLOF(createTLOF(getTargetTriple())) {
   initAsmInfo();
   if (TT.isAMDGCN()) {
@@ -2086,7 +2086,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
     addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
 
-  addPass(AtomicExpandPass(&TM));
+  addPass(AtomicExpandPass(TM));
 
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(AMDGPUPromoteAllocaPass(TM));
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 99ba043..5580e4c 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1860,7 +1860,6 @@ private:
   bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
                               const unsigned CPol);
   bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
-  bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
   bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands);
   bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
   unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -5506,22 +5505,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
   return true;
 }
 
-bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
-                                         const OperandVector &Operands) {
-  if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
-    return true;
-
-  int Simm16Pos =
-      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
-  if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
-    SMLoc Loc = Operands[1]->getStartLoc();
-    Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
-    return false;
-  }
-
-  return true;
-}
-
 bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
                                    const OperandVector &Operands) {
   unsigned Opc = Inst.getOpcode();
@@ -5681,9 +5664,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc,
   if (!validateTFE(Inst, Operands)) {
     return false;
   }
-  if (!validateSetVgprMSB(Inst, Operands)) {
-    return false;
-  }
   if (!validateWMMA(Inst, Operands)) {
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a466780..ac660d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -277,6 +277,8 @@ protected:
   bool HasLshlAddU64Inst = false;
   bool HasAddSubU64Insts = false;
   bool HasMadU32Inst = false;
+  bool HasAddMinMaxInsts = false;
+  bool HasPkAddMinMaxInsts = false;
   bool HasPointSampleAccel = false;
   bool HasLdsBarrierArriveAtomic = false;
   bool HasSetPrioIncWgInst = false;
@@ -1567,10 +1569,10 @@ public:
   bool hasIntMinMax64() const { return GFX1250Insts; }
 
   // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
-  bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+  bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }
 
   // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
-  bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+  bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }
 
   // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
   bool hasPkMinMax3Insts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cbc7427..4d0c163 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -32,7 +32,7 @@ public:
   AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
       : InstrPostProcess(STI, MCII) {}
 
-  ~AMDGPUInstrPostProcess() = default;
+  ~AMDGPUInstrPostProcess() override = default;
 
   void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
 };
@@ -88,7 +88,7 @@ public:
   AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                         const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
 
-  ~AMDGPUCustomBehaviour() = default;
+  ~AMDGPUCustomBehaviour() override = default;
 
   /// This method is used to determine if an instruction
   /// should be allowed to be dispatched. The return value is
   /// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 54fcd2a..246a3f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -64,7 +64,7 @@ private:
   ArrayRef<const MCExpr *> Args;
 
   AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
-  ~AMDGPUMCExpr();
+  ~AMDGPUMCExpr() override;
 
   bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const;
   bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 09ef6ac..2aa54c9 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,9 +45,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
 
-  // 32-bit ABS is legal for AMDGPU except for R600
-  setOperationAction(ISD::ABS, MVT::i32, Expand);
-
   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
   // spaces, so it is custom lowered to handle those where it isn't.
   for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a757421..be42291 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -298,7 +298,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BR_CC,
                      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
 
-  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
+  setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
 
   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ff2d2f..d930a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
     return false;
 
+  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+                                  this]() -> bool {
+    if (CmpValue != 0)
+      return false;
+
+    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+    if (!Def || Def->getParent() != CmpInstr.getParent())
+      return false;
+
+    const auto foldableSelect = [](MachineInstr *Def) -> bool {
+      if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+          Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
+        bool Op1IsNonZeroImm =
+            Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
+        bool Op2IsZeroImm =
+            Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
+        if (Op1IsNonZeroImm && Op2IsZeroImm)
+          return true;
+      }
+      return false;
+    };
+
+    // For S_OP that set SCC = DST!=0, do the transformation
+    //
+    // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+
+    // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+    // for S_CSELECT* already has the same value that will be calculated by
+    // s_cmp_lg_*
+    //
+    // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+    // imm), 0)
+    if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+      return false;
+
+    MachineInstr *KillsSCC = nullptr;
+    for (MachineInstr &MI :
+         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+        return false;
+      if (MI.killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &MI;
+    }
+
+    if (MachineOperand *SccDef =
+            Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+      SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+    CmpInstr.eraseFromParent();
+    return true;
+  };
+
   const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                                this](int64_t ExpectedValue, unsigned SrcSize,
                                      bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
       return false;
 
-    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
-         I != E; ++I) {
-      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
-          I->killsRegister(AMDGPU::SCC, &RI))
+    MachineInstr *KillsSCC = nullptr;
+    for (MachineInstr &MI :
+         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
         return false;
+      if (MI.killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &MI;
     }
 
     MachineOperand *SccDef =
         Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
     SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
     CmpInstr.eraseFromParent();
 
     if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   case AMDGPU::S_CMP_LG_I32:
   case AMDGPU::S_CMPK_LG_U32:
   case AMDGPU::S_CMPK_LG_I32:
-    return optimizeCmpAnd(0, 32, true, false);
+    return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
   case AMDGPU::S_CMP_GT_U32:
   case AMDGPU::S_CMPK_GT_U32:
     return optimizeCmpAnd(0, 32, false, false);
   case AMDGPU::S_CMP_GT_I32:
   case AMDGPU::S_CMPK_GT_I32:
     return optimizeCmpAnd(0, 32, false, true);
   case AMDGPU::S_CMP_LG_U64:
-    return optimizeCmpAnd(0, 64, true, false);
+    return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
   }
 
   return false;
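Roughly, the new optimizeCmpSelect() path is aimed at uniform code of the following shape (a sketch, not a test from the commit; exact instruction selection depends on the subtarget). With scalar kernel arguments, %sel becomes an s_cselect_b32 between a non-zero immediate and zero, so SCC already equals (%sel != 0) and the s_cmp_lg_u32 produced for %c2 can be erased:

define amdgpu_kernel void @cmp_of_select(ptr addrspace(1) %out, i32 %a, i32 %b) {
  %and = and i32 %a, %b               ; s_and_b32 sets SCC = (result != 0)
  %c1 = icmp ne i32 %and, 0
  %sel = select i1 %c1, i32 5, i32 0  ; s_cselect_b32 5, 0 (the foldableSelect shape)
  %c2 = icmp ne i32 %sel, 0           ; s_cmp_lg_u32 %sel, 0 (now redundant)
  %r = select i1 %c2, i32 %a, i32 %b
  store i32 %r, ptr addrspace(1) %out
  ret void
}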
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..dc23a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,52 @@ public:
     }
   }
 
+  static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::S_ABSDIFF_I32:
+    case AMDGPU::S_ABS_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_ANDN2_B32:
+    case AMDGPU::S_ANDN2_B64:
+    case AMDGPU::S_ASHR_I32:
+    case AMDGPU::S_ASHR_I64:
+    case AMDGPU::S_BCNT0_I32_B32:
+    case AMDGPU::S_BCNT0_I32_B64:
+    case AMDGPU::S_BCNT1_I32_B32:
+    case AMDGPU::S_BCNT1_I32_B64:
+    case AMDGPU::S_BFE_I32:
+    case AMDGPU::S_BFE_I64:
+    case AMDGPU::S_BFE_U32:
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_LSHL_B32:
+    case AMDGPU::S_LSHL_B64:
+    case AMDGPU::S_LSHR_B32:
+    case AMDGPU::S_LSHR_B64:
+    case AMDGPU::S_NAND_B32:
+    case AMDGPU::S_NAND_B64:
+    case AMDGPU::S_NOR_B32:
+    case AMDGPU::S_NOR_B64:
+    case AMDGPU::S_NOT_B32:
+    case AMDGPU::S_NOT_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_ORN2_B32:
+    case AMDGPU::S_ORN2_B64:
+    case AMDGPU::S_QUADMASK_B32:
+    case AMDGPU::S_QUADMASK_B64:
+    case AMDGPU::S_WQM_B32:
+    case AMDGPU::S_WQM_B64:
+    case AMDGPU::S_XNOR_B32:
+    case AMDGPU::S_XNOR_B64:
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
+      return true;
+    default:
+      return false;
+    }
+  }
+
   static bool isEXP(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::EXP;
   }
@@ -1594,7 +1640,7 @@ public:
                               unsigned *PredCost = nullptr) const override;
 
   InstructionUniformity
-  getInstructionUniformity(const MachineInstr &MI) const override final;
+  getInstructionUniformity(const MachineInstr &MI) const final;
 
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2c1a13c..019c3b7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
                         const llvm::MachineFunction &MF);
 
   void mappingImpl(yaml::IO &YamlIO) override;
-  ~SIMachineFunctionInfo() = default;
+  ~SIMachineFunctionInfo() override = default;
 };
 
 template <> struct MappingTraits<SIMachineFunctionInfo> {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 07264d9..a177a42 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -640,7 +640,7 @@ public:
 
   bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
 
-  virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+  bool handleCooperativeAtomic(MachineInstr &MI) const override;
 
   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7cce033..05ba76a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
 
 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
-  defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-  defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-  defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-  defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+  defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
+  defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
+  defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
+  defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
 }
 
 defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
@@ -976,10 +976,10 @@ def : GCNPat <
 } // End SubtargetPredicate = HasLshlAddU64Inst
 
 let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
 }
 
 def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fce..4ae2c1e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
                      SDPatternOperator node = null_frag, bit IsDOT = 0> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           !if (P.HasModifiers,
-                               getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
+                               getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
                                getVOP3Pat<P, node>.ret)>;
   let SubtargetPredicate = isGFX11Plus in {
     if P.HasExtVOP3DPP then
@@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
 } // End SubtargetPredicate = HasFmaMixBF16Insts
 
 def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
-  let HasModifiers = 0;
+  let HasNeg = 0;
+  let EnableClamp = 1;
 }
 
 let isCommutable = 1, isReMaterializable = 1 in {
 let SubtargetPredicate = HasPkAddMinMaxInsts in {
-defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
 }
 let SubtargetPredicate = HasPkMinMax3Insts in {
 defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
@@ -463,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
 >;
 
 let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
 }
 
 let SubtargetPredicate = HasPkMinMax3Insts in {
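Finally, a sketch (function name hypothetical) of the IR shape the revised selection patterns target: the add operand must now be saturating (saddsat/uaddsat rather than plain add), matching the clamp bit these instructions carry. The analogous v2i16 pair with llvm.sadd.sat/llvm.smax would map to v_pk_add_max_i16 under the new ThreeOp_OpSelClampPats:

declare i32 @llvm.sadd.sat.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)

define i32 @add_max_clamped(i32 %a, i32 %b, i32 %c) {
  ; Expected to select to a single v_add_max_i32 with clamp set on targets
  ; with FeatureAddMinMaxInsts; a plain wrapping add no longer matches.
  %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
  %max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
  ret i32 %max
}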
