aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombine.td17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h13
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h6
-rw-r--r--llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h2
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h48
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td16
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td21
20 files changed, 223 insertions, 99 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea32748..1c8383c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
"true", "Has v_mad_u32 instruction">;
+def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts",
+ "HasAddMinMaxInsts",
+ "true",
+ "Has v_add_{min|max}_{i|u}32 instructions"
+>;
+
+def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
+ "HasPkAddMinMaxInsts",
+ "true",
+ "Has v_pk_add_{min|max}_{i|u}16 instructions"
+>;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
FeatureMadU32Inst,
+ FeatureAddMinMaxInsts,
+ FeaturePkAddMinMaxInsts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
Feature45BitNumRecordsBufferResource,
@@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts :
def HasAddMinMaxInsts :
Predicate<"Subtarget->hasAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>;
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>;
def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8b211f..7f00ead 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[
combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
]>;
+// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
+ [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
+ (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
+
+def combine_or_s64_s32 : GICombineRule<
+ (defs root:$dst),
+ (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
+ (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
+ (G_OR $or, $x_lo, $y),
+ (G_MERGE_VALUES $dst, $or, $x_hi))>;
+
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
@@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1b4b113..6bad4dbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -131,7 +131,7 @@ protected:
public:
MetadataStreamerMsgPackV4() = default;
- ~MetadataStreamerMsgPackV4() = default;
+ ~MetadataStreamerMsgPackV4() override = default;
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
@@ -154,7 +154,7 @@ protected:
public:
MetadataStreamerMsgPackV5() = default;
- ~MetadataStreamerMsgPackV5() = default;
+ ~MetadataStreamerMsgPackV5() override = default;
};
class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
@@ -163,7 +163,7 @@ protected:
public:
MetadataStreamerMsgPackV6() = default;
- ~MetadataStreamerMsgPackV6() = default;
+ ~MetadataStreamerMsgPackV6() override = default;
void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF,
msgpack::MapDocNode Kern) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8ed4062..1b559a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
- MVT::i32, Legal);
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
+ Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 596a895..1a13b22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
+ auto &MinNumMaxNumIeee =
+ getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypesPK16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
+ } else {
+ MinNumMaxNumIeee.legalFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder(
- {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
- G_FMAXNUM_IEEE});
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
+ } else if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lowerFor({V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .lower();
} else {
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
}
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
@@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
- MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
- // With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
- //
- // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
- // enabled.
- if (!MFI->getMode().IEEE) {
- if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
- MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
- return true;
-
- return !IsIEEEOp;
- }
-
- if (IsIEEEOp)
+ // With ieee_mode disabled, the instructions have the correct behavior.
+ if (!MFI->getMode().IEEE)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index c5c9473..0804133 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -26,20 +26,19 @@ struct PerFunctionMIParsingState;
class AMDGPUMIRFormatter final : public MIRFormatter {
public:
AMDGPUMIRFormatter() = default;
- virtual ~AMDGPUMIRFormatter() = default;
+ ~AMDGPUMIRFormatter() override = default;
/// Implement target specific printing for machine operand immediate value, so
/// that we can have more meaningful mnemonic than a 64-bit integer. Passing
/// None to OpIdx means the index is unknown.
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
- std::optional<unsigned> OpIdx,
- int64_t Imm) const override;
+ void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx, int64_t Imm) const override;
/// Implement target specific parsing of immediate mnemonics. The mnemonic is
/// a string with a leading dot.
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
- StringRef Src, int64_t &Imm,
- ErrorCallbackType ErrorCallback) const override;
+ bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
/// Implement target specific parsing of target custom pseudo source value.
bool
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a4..54ba2f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_perm_pk16_b4_u4:
case Intrinsic::amdgcn_perm_pk16_b6_u4:
case Intrinsic::amdgcn_perm_pk16_b8_u4:
+ case Intrinsic::amdgcn_add_max_i32:
+ case Intrinsic::amdgcn_add_max_u32:
+ case Intrinsic::amdgcn_add_min_i32:
+ case Intrinsic::amdgcn_add_min_u32:
+ case Intrinsic::amdgcn_pk_add_max_i16:
+ case Intrinsic::amdgcn_pk_add_max_u16:
+ case Intrinsic::amdgcn_pk_add_min_i16:
+ case Intrinsic::amdgcn_pk_add_min_u16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 996b55f..6214f4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -740,7 +740,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
return "r600";
}
-static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel() {
// The AMDGPU toolchain only supports generating shared objects, so we
// must always use PIC.
return Reloc::PIC_;
@@ -754,8 +754,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOptLevel OptLevel)
: CodeGenTargetMachineImpl(
T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
- getEffectiveRelocModel(RM),
- getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
+ getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
+ OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
if (TT.isAMDGCN()) {
@@ -2086,7 +2086,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
- addPass(AtomicExpandPass(&TM));
+ addPass(AtomicExpandPass(TM));
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(AMDGPUPromoteAllocaPass(TM));
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 99ba043..5580e4c 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1860,7 +1860,6 @@ private:
bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
- bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands);
bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -5506,22 +5505,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
-bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
- const OperandVector &Operands) {
- if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
- return true;
-
- int Simm16Pos =
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
- if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
- SMLoc Loc = Operands[1]->getStartLoc();
- Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
- return false;
- }
-
- return true;
-}
-
bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
const OperandVector &Operands) {
unsigned Opc = Inst.getOpcode();
@@ -5681,9 +5664,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc,
if (!validateTFE(Inst, Operands)) {
return false;
}
- if (!validateSetVgprMSB(Inst, Operands)) {
- return false;
- }
if (!validateWMMA(Inst, Operands)) {
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a466780..ac660d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -277,6 +277,8 @@ protected:
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
bool HasMadU32Inst = false;
+ bool HasAddMinMaxInsts = false;
+ bool HasPkAddMinMaxInsts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -1567,10 +1569,10 @@ public:
bool hasIntMinMax64() const { return GFX1250Insts; }
// \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
- bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
- bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cbc7427..4d0c163 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -32,7 +32,7 @@ public:
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~AMDGPUInstrPostProcess() = default;
+ ~AMDGPUInstrPostProcess() override = default;
void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
};
@@ -88,7 +88,7 @@ public:
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
- ~AMDGPUCustomBehaviour() = default;
+ ~AMDGPUCustomBehaviour() override = default;
/// This method is used to determine if an instruction
/// should be allowed to be dispatched. The return value is
/// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 54fcd2a..246a3f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -64,7 +64,7 @@ private:
ArrayRef<const MCExpr *> Args;
AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
- ~AMDGPUMCExpr();
+ ~AMDGPUMCExpr() override;
bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const;
bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 09ef6ac..2aa54c9 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,9 +45,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
- // 32-bit ABS is legal for AMDGPU except for R600
- setOperationAction(ISD::ABS, MVT::i32, Expand);
-
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a757421..be42291 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -298,7 +298,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC,
{MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
- setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
+ setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ff2d2f..d930a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
+ const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this]() -> bool {
+ if (CmpValue != 0)
+ return false;
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ if (!Def || Def->getParent() != CmpInstr.getParent())
+ return false;
+
+ const auto foldableSelect = [](MachineInstr *Def) -> bool {
+ if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
+ bool Op1IsNonZeroImm =
+ Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
+ if (Op1IsNonZeroImm && Op2IsZeroImm)
+ return true;
+ }
+ return false;
+ };
+
+ // For S_OP that set SCC = DST!=0, do the transformation
+ //
+ // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+
+ // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+ // for S_CSELECT* already has the same value that will be calculated by
+ // s_cmp_lg_*
+ //
+ // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+ // imm), 0)
+ if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+ return false;
+
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+
+ if (MachineOperand *SccDef =
+ Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ CmpInstr.eraseFromParent();
+ return true;
+ };
+
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
- I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
- I->killsRegister(AMDGPU::SCC, &RI))
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false);
+ return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false);
+ return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..dc23a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,52 @@ public:
}
}
+ static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_ABSDIFF_I32:
+ case AMDGPU::S_ABS_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ANDN2_B64:
+ case AMDGPU::S_ASHR_I32:
+ case AMDGPU::S_ASHR_I64:
+ case AMDGPU::S_BCNT0_I32_B32:
+ case AMDGPU::S_BCNT0_I32_B64:
+ case AMDGPU::S_BCNT1_I32_B32:
+ case AMDGPU::S_BCNT1_I32_B64:
+ case AMDGPU::S_BFE_I32:
+ case AMDGPU::S_BFE_I64:
+ case AMDGPU::S_BFE_U32:
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_LSHL_B32:
+ case AMDGPU::S_LSHL_B64:
+ case AMDGPU::S_LSHR_B32:
+ case AMDGPU::S_LSHR_B64:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NAND_B64:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_NOR_B64:
+ case AMDGPU::S_NOT_B32:
+ case AMDGPU::S_NOT_B64:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_ORN2_B64:
+ case AMDGPU::S_QUADMASK_B32:
+ case AMDGPU::S_QUADMASK_B64:
+ case AMDGPU::S_WQM_B32:
+ case AMDGPU::S_WQM_B64:
+ case AMDGPU::S_XNOR_B32:
+ case AMDGPU::S_XNOR_B64:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
+ return true;
+ default:
+ return false;
+ }
+ }
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
@@ -1594,7 +1640,7 @@ public:
unsigned *PredCost = nullptr) const override;
InstructionUniformity
- getInstructionUniformity(const MachineInstr &MI) const override final;
+ getInstructionUniformity(const MachineInstr &MI) const final;
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2c1a13c..019c3b7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
const llvm::MachineFunction &MF);
void mappingImpl(yaml::IO &YamlIO) override;
- ~SIMachineFunctionInfo() = default;
+ ~SIMachineFunctionInfo() override = default;
};
template <> struct MappingTraits<SIMachineFunctionInfo> {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 07264d9..a177a42 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -640,7 +640,7 @@ public:
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
- virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+ bool handleCooperativeAtomic(MachineInstr &MI) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7cce033..05ba76a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
- defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
- defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
}
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
@@ -976,10 +976,10 @@ def : GCNPat <
} // End SubtargetPredicate = HasLshlAddU64Inst
let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
}
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fce..4ae2c1e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
!if (P.HasModifiers,
- getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
+ getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
@@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
- let HasModifiers = 0;
+ let HasNeg = 0;
+ let EnableClamp = 1;
}
let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
@@ -463,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
>;
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {