Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Combine.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 25
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 104
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 93
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 31
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 160
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 65
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 85
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 127
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 254
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 31
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 124
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 7
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrVFP.td | 112
-rw-r--r--  llvm/lib/Target/ARM/ARMRegisterInfo.td | 8
-rw-r--r--  llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp | 138
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp | 12
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp | 148
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp | 24
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVGISel.td | 82
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 152
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 29
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.h | 19
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 74
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZa.td | 56
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td | 17
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 36
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td | 55
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 191
-rw-r--r--  llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 9
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.cpp | 26
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.h | 5
-rw-r--r--  llvm/lib/Target/Sparc/SparcInstrInfo.cpp | 17
-rw-r--r--  llvm/lib/Target/Sparc/SparcInstrInfo.td | 29
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 3
68 files changed, 1836 insertions(+), 902 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 076a623..639ddcb 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -69,7 +69,6 @@ def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>;
def AArch64PreLegalizerCombiner: GICombiner<
"AArch64PreLegalizerCombinerImpl", [all_combines,
- fconstant_to_constant,
icmp_redundant_trunc,
fold_global_offset,
shuffle_to_extract,
@@ -341,7 +340,7 @@ def AArch64PostLegalizerLowering
: GICombiner<"AArch64PostLegalizerLoweringImpl",
[shuffle_vector_lowering, vashr_vlshr_imm,
icmp_lowering, build_vector_lowering,
- lower_vector_fcmp, form_truncstore,
+ lower_vector_fcmp, form_truncstore, fconstant_to_constant,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mulv2s64,
vector_unmerge_lowering, insertelt_nonconst,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c197550e..9e2d698 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({s32, s64, s128})
- .legalFor(HasFP16, {s16})
+ // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT
+ .legalFor({s16, s32, s64, s128})
.clampScalar(0, MinFPScalar, s128);
// FIXME: fix moreElementsToNextPow2
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 63313da..23dcaea 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -75,6 +75,31 @@ struct ShuffleVectorPseudo {
ShuffleVectorPseudo() = default;
};
+/// Return true if a G_FCONSTANT instruction is known to be better-represented
+/// as a G_CONSTANT.
+bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ Register DstReg = MI.getOperand(0).getReg();
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ if (DstSize != 16 && DstSize != 32 && DstSize != 64)
+ return false;
+
+ // When we're storing a value, it doesn't matter what register bank it's on.
+ // Since not all floating point constants can be materialized using a fmov,
+ // it makes more sense to just use a GPR.
+ return all_of(MRI.use_nodbg_instructions(DstReg),
+ [](const MachineInstr &Use) { return Use.mayStore(); });
+}
+
+/// Change a G_FCONSTANT into a G_CONSTANT.
+void applyFConstantToConstant(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ MachineIRBuilder MIB(MI);
+ const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
+ MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
+ MI.eraseFromParent();
+}
+
/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
/// sources of the shuffle are different.
std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 8c10673..896eab5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -44,31 +44,6 @@ namespace {
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
-/// Return true if a G_FCONSTANT instruction is known to be better-represented
-/// as a G_CONSTANT.
-bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
- assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
- Register DstReg = MI.getOperand(0).getReg();
- const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- if (DstSize != 32 && DstSize != 64)
- return false;
-
- // When we're storing a value, it doesn't matter what register bank it's on.
- // Since not all floating point constants can be materialized using a fmov,
- // it makes more sense to just use a GPR.
- return all_of(MRI.use_nodbg_instructions(DstReg),
- [](const MachineInstr &Use) { return Use.mayStore(); });
-}
-
-/// Change a G_FCONSTANT into a G_CONSTANT.
-void applyFConstantToConstant(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
- MachineIRBuilder MIB(MI);
- const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
- MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
- MI.eraseFromParent();
-}
-
/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index f90bcc7..830a35bb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -590,6 +590,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
unsigned Depth) const {
switch (MI.getOpcode()) {
case AArch64::G_DUP:
+ case AArch64::G_SADDLP:
+ case AArch64::G_UADDLP:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -798,6 +800,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (Ty.isVector())
OpRegBankIdx[Idx] = PMI_FirstFPR;
else if (isPreISelGenericFloatingPointOpcode(Opc) ||
+ (MO.isDef() && onlyDefinesFP(MI, MRI, TRI)) ||
+ (MO.isUse() && onlyUsesFP(MI, MRI, TRI)) ||
Ty.getSizeInBits() > 64)
OpRegBankIdx[Idx] = PMI_FirstFPR;
else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1a697f7..ea32748 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2592,6 +2592,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->d16PreservesUnusedBits()">;
+
def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
@@ -2769,6 +2772,9 @@ def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
AssemblerPredicate<(all_of FeatureMAIInsts)>;
+def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">,
+ AssemblerPredicate<(all_of (not FeatureMAIInsts))>;
+
def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">,
AssemblerPredicate<(all_of FeatureSMemRealTime)>;
@@ -2943,6 +2949,20 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic(
def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;
+def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">,
+ AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>;
+
+//===----------------------------------------------------------------------===//
+// HwModes
+//===----------------------------------------------------------------------===//
+
+// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement
+def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>;
+
+// gfx1250, has alignment requirement but no AGPRs.
+def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>;
+
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936..ef58004 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
- for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+ const CallBase &Call) {
+ unsigned ArgNo = 0;
+ unsigned ResNo = 0;
+ unsigned AGPRDefCount = 0;
+ unsigned AGPRUseCount = 0;
+ unsigned MaxPhysReg = 0;
+ const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+ // TODO: Overestimates due to not accounting for tied operands
+ for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ Type *Ty = nullptr;
+ switch (CI.Type) {
+ case InlineAsm::isOutput: {
+ Ty = Call.getType();
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ Ty = STy->getElementType(ResNo);
+ ++ResNo;
+ break;
+ }
+ case InlineAsm::isInput: {
+ Ty = Call.getArgOperand(ArgNo++)->getType();
+ break;
+ }
+ case InlineAsm::isLabel:
+ continue;
+ case InlineAsm::isClobber:
+ // Parse the physical register reference.
+ break;
+ }
+
for (StringRef Code : CI.Codes) {
- Code.consume_front("{");
- if (Code.starts_with("a"))
- return true;
+ unsigned RegCount = 0;
+ if (Code.starts_with("a")) {
+ // Virtual register, compute number of registers based on the type.
+ //
+ // We ought to be going through TargetLowering to get the number of
+ // registers, but we should avoid the dependence on CodeGen here.
+ RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+ } else {
+ // Physical register reference
+ auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+ if (Kind == 'a') {
+ RegCount = NumRegs;
+ MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+ }
+
+ continue;
+ }
+
+ if (CI.Type == InlineAsm::isOutput) {
+ // Apply tuple alignment requirement
+ //
+ // TODO: This is more conservative than necessary.
+ AGPRDefCount = alignTo(AGPRDefCount, RegCount);
+
+ AGPRDefCount += RegCount;
+ if (CI.isEarlyClobber) {
+ AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+ AGPRUseCount += RegCount;
+ }
+ } else {
+ AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+ AGPRUseCount += RegCount;
+ }
}
}
- return false;
+ unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+
+ // TODO: This is overly conservative. If there are any physical registers,
+ // allocate any virtual registers after them so we don't have to solve optimal
+ // packing.
+ return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
const Function *Callee = dyn_cast<Function>(CalleeOp);
if (!Callee) {
if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
- return !inlineAsmUsesAGPRs(IA);
+ return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
return false;
}
- // Some intrinsics may use AGPRs, but if we have a choice, we are not
- // required to use AGPRs.
- if (Callee->isIntrinsic())
+ switch (Callee->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ break;
+ case Intrinsic::write_register:
+ case Intrinsic::read_register:
+ case Intrinsic::read_volatile_register: {
+ const MDString *RegName = cast<MDString>(
+ cast<MDNode>(
+ cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
+ ->getOperand(0));
+ auto [Kind, RegIdx, NumRegs] =
+ AMDGPU::parseAsmPhysRegName(RegName->getString());
+ return Kind != 'a';
+ }
+ default:
+ // Some intrinsics may use AGPRs, but if we have a choice, we are not
+ // required to use AGPRs.
return true;
+ }
// TODO: Handle callsite attributes
const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
@@ -1504,7 +1584,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
- A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
CallingConv::ID CC = F->getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CC)) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
@@ -1515,6 +1594,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
if (!F->isDeclaration() && ST.hasClusters())
A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
+ if (ST.hasGFX90AInsts())
+ A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+
for (auto &I : instructions(F)) {
Value *Ptr = nullptr;
if (auto *LI = dyn_cast<LoadInst>(&I))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2192a72..e4d328a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -393,12 +393,13 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
switch (N->getMachineOpcode()) {
default: {
- const MCInstrDesc &Desc =
- Subtarget->getInstrInfo()->get(N->getMachineOpcode());
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
unsigned OpIdx = Desc.getNumDefs() + OpNo;
if (OpIdx >= Desc.getNumOperands())
return nullptr;
- int RegClass = Desc.operands()[OpIdx].RegClass;
+
+ int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
if (RegClass == -1)
return nullptr;
@@ -4353,7 +4354,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (!RC || SIRI->isSGPRClass(RC))
return false;
- if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
+ if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
+ RC != &AMDGPU::VS_64_Align2RegClass) {
AllUsesAcceptSReg = false;
SDNode *User = U->getUser();
if (User->isMachineOpcode()) {
@@ -4367,7 +4369,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
const TargetRegisterClass *CommutedRC =
getOperandRegClass(U->getUser(), CommutedOpNo);
if (CommutedRC == &AMDGPU::VS_32RegClass ||
- CommutedRC == &AMDGPU::VS_64RegClass)
+ CommutedRC == &AMDGPU::VS_64RegClass ||
+ CommutedRC == &AMDGPU::VS_64_Align2RegClass)
AllUsesAcceptSReg = true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 848d9a5..557d87f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned MinNumRegsRequired = DstSize / 32;
+
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// vdst, srcA, srcB, srcC
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
OpdsMapping[0] =
- Info->mayNeedAGPRs()
+ Info->getMinNumAGPRs() >= MinNumRegsRequired
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->mayNeedAGPRs()
+ Info->getMinNumAGPRs() >= MinNumRegsRequired
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned MinNumRegsRequired = DstSize / 32;
+
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
OpdsMapping[0] =
- Info->mayNeedAGPRs()
+ Info->getMinNumAGPRs() >= MinNumRegsRequired
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->mayNeedAGPRs()
+ Info->getMinNumAGPRs() >= MinNumRegsRequired
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 280fbe2..c7a91f4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -929,8 +929,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
ThinOrFullLTOPhase Phase) {
if (Level != OptimizationLevel::O0) {
if (!isLTOPreLink(Phase)) {
- AMDGPUAttributorOptions Opts;
- MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+ if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
+ AMDGPUAttributorOptions Opts;
+ MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+ }
}
}
});
@@ -964,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(InternalizePass(mustPreserveGV));
PM.addPass(GlobalDCEPass());
}
- if (EnableAMDGPUAttributor) {
+ if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
AMDGPUAttributorOptions Opt;
if (HasClosedWorldAssumption)
Opt.IsClosedWorld = true;
@@ -1296,7 +1298,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (LowerCtorDtor)
addPass(createAMDGPUCtorDtorLoweringLegacyPass());
- if (isPassEnabled(EnableImageIntrinsicOptimizer))
+ if (TM.getTargetTriple().isAMDGCN() &&
+ isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
// This can be disabled by passing ::Disable here or on the command line
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a67a7be..a8140c3 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1347,6 +1347,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool ForcedDPP = false;
bool ForcedSDWA = false;
KernelScopeInfo KernelScope;
+ const unsigned HwMode;
/// @name Auto-generated Match Functions
/// {
@@ -1356,6 +1357,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
/// }
+ /// Get size of register operand
+ unsigned getRegOperandSize(const MCInstrDesc &Desc, unsigned OpNo) const {
+ assert(OpNo < Desc.NumOperands);
+ int16_t RCID = MII.getOpRegClassID(Desc.operands()[OpNo], HwMode);
+ return getRegBitWidth(RCID) / 8;
+ }
+
private:
void createConstantSymbol(StringRef Id, int64_t Val);
@@ -1442,9 +1450,9 @@ public:
using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>;
AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
- const MCInstrInfo &MII,
- const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) {
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI, MII), Parser(_Parser),
+ HwMode(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)) {
MCAsmParserExtension::Initialize(Parser);
setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
@@ -1944,6 +1952,7 @@ public:
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
void cvtVINTERP(MCInst &Inst, const OperandVector &Operands);
+ void cvtOpSelHelper(MCInst &Inst, unsigned OpSel);
bool parseDimId(unsigned &Encoding);
ParseStatus parseDim(OperandVector &Operands);
@@ -4106,7 +4115,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc) {
if ((DMaskIdx == -1 || TFEIdx == -1) && isGFX10_AEncoding()) // intersect_ray
return true;
- unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
+ unsigned VDataSize = getRegOperandSize(Desc, VDataIdx);
unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0;
unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
@@ -4170,8 +4179,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) {
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
unsigned ActualAddrSize =
- IsNSA ? SrsrcIdx - VAddr0Idx
- : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
+ IsNSA ? SrsrcIdx - VAddr0Idx : getRegOperandSize(Desc, VAddr0Idx) / 4;
unsigned ExpectedAddrSize =
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
@@ -4181,8 +4189,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) {
ExpectedAddrSize >
getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) {
int VAddrLastIdx = SrsrcIdx - 1;
- unsigned VAddrLastSize =
- AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4;
+ unsigned VAddrLastSize = getRegOperandSize(Desc, VAddrLastIdx) / 4;
ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize;
}
@@ -4428,7 +4435,8 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst,
return true;
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (TRI->getRegClass(Desc.operands()[0].RegClass).getSizeInBits() <= 128)
+ if (TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[0], HwMode))
+ .getSizeInBits() <= 128)
return true;
if (TRI->regsOverlap(Src2Reg, DstReg)) {
@@ -4999,7 +5007,7 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) &&
- AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) {
+ AMDGPU::isDPALU_DPP(MII.get(Opc), MII, getSTI())) {
// DP ALU DPP is supported for row_newbcast only on GFX9* and row_share
// only on GFX12.
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
@@ -5522,7 +5530,8 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
unsigned Fmt = Inst.getOperand(FmtIdx).getImm();
int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp);
unsigned RegSize =
- TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits();
+ TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[SrcIdx], HwMode))
+ .getSizeInBits();
if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
return true;
@@ -9239,6 +9248,33 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
MCOI::OperandConstraint::TIED_TO) == -1;
}
+void AMDGPUAsmParser::cvtOpSelHelper(MCInst &Inst, unsigned OpSel) {
+ unsigned Opc = Inst.getOpcode();
+ constexpr AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2};
+ constexpr AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
+ if (OpIdx == -1)
+ // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but
+ // no src1. So continue instead of break.
+ continue;
+
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+ // op_sel[3] is encoded in src0_modifiers.
+ if (ModOps[J] == AMDGPU::OpName::src0_modifiers && (OpSel & (1 << 3)) != 0)
+ ModVal |= SISrcMods::DST_OP_SEL;
+
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+}
+
void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
{
OptionalImmIndexMap OptionalIdx;
@@ -9275,6 +9311,16 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOModSI);
+
+ // Some v_interp instructions use op_sel[3] for dst.
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOpSel);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ cvtOpSelHelper(Inst, OpSel);
+ }
}
void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
@@ -9310,31 +9356,10 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
if (OpSelIdx == -1)
return;
- const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1,
- AMDGPU::OpName::src2};
- const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers,
- AMDGPU::OpName::src1_modifiers,
- AMDGPU::OpName::src2_modifiers};
-
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
-
- for (int J = 0; J < 3; ++J) {
- int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
- if (OpIdx == -1)
- break;
-
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
- uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
-
- if ((OpSel & (1 << J)) != 0)
- ModVal |= SISrcMods::OP_SEL_0;
- if (ModOps[J] == AMDGPU::OpName::src0_modifiers &&
- (OpSel & (1 << 3)) != 0)
- ModVal |= SISrcMods::DST_OP_SEL;
-
- Inst.getOperand(ModIdx).setImm(ModVal);
- }
+ cvtOpSelHelper(Inst, OpSel);
}
+
void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 09a66d7..b97b738 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -417,10 +417,10 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> {
}
class getMUBUFInsDA<list<RegisterOperand> vdataList,
- list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
+ list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
- RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
- RegisterOperand vdata_op = getBUFVDataRegisterOperandForOp<vdataClass, isTFE>.ret;
+ RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret;
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
@@ -453,8 +453,8 @@ class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit
!if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
(ins))))));
}
@@ -677,8 +677,8 @@ class MUBUF_Pseudo_Store_Lds<string opName>
}
class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset,
- list<RegisterClass> vaddrList=[]> {
- RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ list<RegisterClassLike> vaddrList=[]> {
+ RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
@@ -702,9 +702,9 @@ class getMUBUFAtomicIns<int addrKind,
!if(!eq(addrKind, BUFAddrKind.IdxEn),
getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.BothEn),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret,
!if(!eq(addrKind, BUFAddrKind.Addr64),
- getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret,
(ins))))));
}
@@ -1568,11 +1568,12 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string
# !if(!eq(RtnMode, "ret"), "", "_noret")
# "_" # vt);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
- defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass;
+ defvar data_op = getVregSrcForVT<data_vt>.ret;
+ defvar data_vt_RC = getVregClassForVT<data_vt>.ret;
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
- data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ data_op:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
Offset:$offset);
def : GCNPat<
(vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)),
@@ -1583,7 +1584,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string
>;
defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix)
- data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
+ data_op:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, Offset:$offset);
def : GCNPat<
(vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
@@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(extract_cpol_set_glc $auxiliary),
(extract_cpol $auxiliary));
defvar SrcRC = getVregSrcForVT<vt>.ret;
- defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass;
+ defvar DataRC = getVregClassForVT<data_vt>.ret;
defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3);
@@ -2088,7 +2089,7 @@ defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>;
multiclass MUBUFScratchStorePat_Common <string Instr,
ValueType vt, PatFrag st,
- RegisterClass rc = VGPR_32> {
+ RegisterClassLike rc = VGPR_32> {
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, i32:$offset)),
@@ -2104,7 +2105,7 @@ multiclass MUBUFScratchStorePat_Common <string Instr,
multiclass MUBUFScratchStorePat <string Instr,
ValueType vt, PatFrag st,
- RegisterClass rc = VGPR_32> {
+ RegisterClassLike rc = VGPR_32> {
let SubtargetPredicate = HasUnrestrictedSOffset in {
defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>;
}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index b2ff5a1..d0ad120 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -904,7 +904,7 @@ let SubtargetPredicate = isGFX1250Plus in {
let WaveSizePredicate = isWave32, mayStore = 0 in {
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>;
-defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>;
+defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96_Align1>;
} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>;
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>;
@@ -934,7 +934,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>;
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>;
defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>;
- defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>;
+ defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96_Align1>;
}
//===----------------------------------------------------------------------===//
@@ -951,6 +951,11 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat
(inst $ptr, Offset:$offset, (i1 gds))
>;
+class DSReadPat_t16 <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+ (EXTRACT_SUBREG (inst $ptr, Offset:$offset, (i1 gds)), lo16)
+>;
+
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -968,13 +973,14 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
}
- let OtherPredicates = [NotLDSRequiresM0Init] in {
- let True16Predicate = NotUseRealTrue16Insts in {
- def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
- }
- let True16Predicate = UseRealTrue16Insts in {
- def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
- }
+ let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = NotUseRealTrue16Insts in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+ def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = UseTrue16WithSramECC in {
+ def : DSReadPat_t16<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2120bf8..f11b373 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -57,7 +57,9 @@ static int64_t getInlineImmVal64(unsigned Imm);
AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
MCContext &Ctx, MCInstrInfo const *MCII)
: MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
- MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
+ MAI(*Ctx.getAsmInfo()),
+ HwModeRegClass(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
+ TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {
// ToDo: AMDGPUDisassembler supports only VI ISA.
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
@@ -825,7 +827,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+ if (Desc.TSFlags & SIInstrFlags::MIMG) {
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int RsrcIdx =
@@ -838,7 +841,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
for (unsigned i = 0; i < NSAArgs; ++i) {
const unsigned VAddrIdx = VAddr0Idx + 1 + i;
auto VAddrRCID =
- MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
+ MCII->getOpRegClassID(Desc.operands()[VAddrIdx], HwModeRegClass);
MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
}
Bytes = Bytes.slice(4 * NSAWords);
@@ -1311,7 +1314,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
// Widen the register to the correct number of enabled channels.
MCRegister NewVdata;
if (DstSize != Info->VDataDwords) {
- auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;
+ auto DataRCID = MCII->getOpRegClassID(
+ MCII->get(NewOpcode).operands()[VDataIdx], HwModeRegClass);
// Get first subregister of VData
MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg();
@@ -1338,7 +1342,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
- auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
+ auto AddrRCID = MCII->getOpRegClassID(
+ MCII->get(NewOpcode).operands()[VAddrSAIdx], HwModeRegClass);
+
const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID);
NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC);
NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 935c383..2751857 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -41,6 +41,7 @@ private:
std::unique_ptr<MCInstrInfo const> const MCII;
const MCRegisterInfo &MRI;
const MCAsmInfo &MAI;
+ const unsigned HwModeRegClass;
const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 5a22b23..6de59be 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -229,7 +229,7 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
class FLAT_Load_Pseudo<
string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0,
bit HasSaddr = 0, bit EnableSaddr = 0,
- RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)>
+ RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
: FLAT_Pseudo<opName, (outs), (ins), ""> {
let OutOperandList = (outs vdata_op:$vdst);
@@ -268,7 +268,7 @@ multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0,
- RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> {
+ RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> {
let InOperandList = !con(
(ins VaddrRC:$vaddr, vdataClass:$vdata),
!if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)),
@@ -385,7 +385,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
(outs ),
!con(
!if(IsAsync, (ins VGPR_32:$vdst), (ins)),
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
(ins flat_offset:$offset, CPol_0:$cpol)),
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
let LGKM_CNT = !not(IsAsync);
@@ -417,7 +417,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata),
(ins flat_offset:$offset, CPol_0:$cpol)),
" $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
let VM_CNT = 0;
@@ -511,7 +511,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
-class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> :
FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
let has_vdst = 0;
let has_data = 0;
@@ -533,7 +533,7 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
let is_flat_global = 1, has_saddr = 1 in {
- def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
GlobalSaddrTable<1, opName> {
@@ -754,7 +754,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
RegisterOperand data_op = vdst_op> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
" $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName> {
let FPAtomic = data_vt.isFP;
@@ -786,7 +786,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op_vgpr:$vdst),
- (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
@@ -811,7 +811,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_op_agpr:$vdst),
- (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (ins VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn_agpr"> {
let FPAtomic = data_vt.isFP;
@@ -837,7 +837,7 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN<
ValueType data_vt = vt,
RegisterOperand data_op = vdst_op,
bit EnableSaddr = false,
- RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)>
+ RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
: FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> {
let InOperandList = !con(
(ins VaddrRC:$vaddr, data_op:$vdata),
@@ -867,7 +867,7 @@ class FLAT_Global_Atomic_Pseudo_RTN<
RegisterOperand data_op = vdst_op,
bit EnableSaddr = false,
bit IsVGPR = false,
- RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)>
+ RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)>
: FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> {
defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret);
@@ -1321,7 +1321,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
}
let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in {
- defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>;
+ defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96_Align1>;
defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>;
}
@@ -1383,6 +1383,11 @@ class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
(inst $vaddr, $offset, (i32 0))
>;
+class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1393,6 +1398,11 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
(inst $vaddr, $offset, (i32 0))
>;
+class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset, (i32 0)), lo16)
+>;
+
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
(inst $saddr, $voffset, $offset, $cpol, $in)
@@ -1408,6 +1418,11 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
(inst $dsaddr, $vaddr, $offset, $cpol)
@@ -1443,6 +1458,11 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu
(inst $saddr, $voffset, $offset, $cpol)
>;
+class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
@@ -1519,7 +1539,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1548,7 +1568,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
defvar rtnNode = !cast<SDPatternOperator>(node);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1592,7 +1612,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
let SubtargetPredicate = inst.SubtargetPredicate;
let OtherPredicates = inst.OtherPredicates;
}
@@ -1625,6 +1645,11 @@ class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $offset, 0)
>;
+class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
@@ -1645,6 +1670,11 @@ class ScratchLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Val
(inst $saddr, $offset, 0)
>;
+class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $saddr, $offset), lo16)
+>;
+
class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)),
@@ -1672,6 +1702,11 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
+>;
+
multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> {
def : FlatLoadLDSSignedPat_M0 <inst, node> {
let AddedComplexity = 10;
@@ -1764,6 +1799,16 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
}
}
+multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatSignedLoadPat_t16<inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatStoreSignedPat <inst, node, vt> {
@@ -1872,8 +1917,8 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
}
}
-multiclass ScratchFLATStorePats_t16<string inst, SDPatternOperator node,
- ValueType vt> {
+multiclass ScratchFLATStorePats_D16_t16<string inst, SDPatternOperator node,
+ ValueType vt> {
def : ScratchStoreSignedPat <!cast<FLAT_Pseudo>(inst#"_t16"), node, vt> {
let AddedComplexity = 25;
}
@@ -1918,6 +1963,21 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
}
+multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : ScratchLoadSignedPat_t16 <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+
+ def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
+}
+
multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadPat <inst, node, vt> {
let OtherPredicates = [HasFlatAddressSpace];
@@ -1947,6 +2007,17 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
}
}
+multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_t16 <inst, node, vt> {
+ let OtherPredicates = [HasFlatAddressSpace];
+ }
+
+ def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatStorePat <inst, node, vt> {
let OtherPredicates = [HasFlatAddressSpace];
@@ -1997,6 +2068,17 @@ let True16Predicate = NotUseRealTrue16Insts in {
defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
+let True16Predicate = UseTrue16WithSramECC in {
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+}
+
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
@@ -2006,11 +2088,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
defm : FlatStorePats_t16 <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
defm : FlatStorePats_t16 <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
-} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+}
defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
@@ -2140,6 +2225,20 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
+let True16Predicate = UseTrue16WithSramECC in {
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
+}
+
let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
@@ -2192,6 +2291,13 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
}
+let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = UseRealTrue16Insts in {
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", store_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+}
+
let OtherPredicates = [HasD16LoadStore] in {
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
@@ -2362,14 +2468,24 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
}
-let True16Predicate = UseRealTrue16Insts in {
+let True16Predicate = UseTrue16WithSramECC in {
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>;
+}
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
-} // End True16Predicate = UseRealTrue16Insts
+} // End OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+}
foreach vt = Reg32Types.types in {
defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 8821558..464cbec 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -722,7 +722,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) &&
- AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) {
+ AMDGPU::isDPALU_DPP(TII->get(OrigOp), *TII, *ST)) {
LLVM_DEBUG(dbgs() << " " << OrigMI
<< " failed: not valid 64-bit DPP control value\n");
break;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1d9a427..a911e7e 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -869,7 +869,7 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
int VDataRCID = -1;
if (VDataIdx != -1)
- VDataRCID = Desc.operands()[VDataIdx].RegClass;
+ VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
// There is no hazard if the instruction does not use vector regs
@@ -893,8 +893,8 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
// All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
if (TII->isMIMG(MI)) {
int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
- assert(SRsrcIdx != -1 &&
- AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
+ assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
+ Desc.operands()[SRsrcIdx])) == 256);
(void)SRsrcIdx;
}
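
Note: the two hunks above are one instance of a pattern repeated throughout this patch. Code that used to read Desc.operands()[Idx].RegClass directly now asks the instruction info for the class via getOpRegClassID(), because an operand's register class can now vary with the subtarget's register-info HwMode. The following standalone sketch only models that indirection; the types and table layout are illustrative assumptions, not the generated LLVM code.

#include <array>
#include <cassert>
#include <cstdint>

// Illustrative model only: an operand either names a concrete register class
// or, when marked as hwmode-dependent, a row index into a per-hwmode table.
struct OperandInfoModel {
  int16_t RegClassOrIndex; // concrete class ID, or row index if ByHwMode
  bool ByHwMode;
};

// One row per hwmode-dependent operand, one column per hardware mode
// (e.g. default, gfx90a-style aligned, aligned-VGPR-without-AGPR).
using HwModeRow = std::array<int16_t, 3>;

int16_t resolveOpRegClassID(const OperandInfoModel &Op,
                            const std::array<HwModeRow, 4> &Table,
                            unsigned HwMode) {
  if (!Op.ByHwMode)
    return Op.RegClassOrIndex; // behaves like the old direct field read
  assert(Op.RegClassOrIndex >= 0 && HwMode < 3);
  return Table[Op.RegClassOrIndex][HwMode]; // hwmode picks the concrete class
}

Call sites such as createsVALUHazard stay hwmode-agnostic under this scheme; only the resolver needs to know which mode the subtarget selected.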
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index d3b5718..e82f998 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -788,9 +788,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
// Check if operand register class contains register used.
// Intention: print disassembler message when invalid code is decoded,
// for example sgpr register used in VReg or VISrc(VReg or imm) operand.
- int RCID = Desc.operands()[OpNo].RegClass;
+ const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
+ int16_t RCID = MII.getOpRegClassID(
+ OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
if (RCID != -1) {
- const MCRegisterClass RC = MRI.getRegClass(RCID);
+ const MCRegisterClass &RC = MRI.getRegClass(RCID);
auto Reg = mc2PseudoReg(Op.getReg());
if (!RC.contains(Reg) && !isInlineValue(Reg)) {
O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC)
@@ -1025,7 +1027,7 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) &&
- AMDGPU::isDPALU_DPP(Desc, STI)) {
+ AMDGPU::isDPALU_DPP(Desc, MII, STI)) {
O << " /* DP ALU dpp only supports "
<< (isGFX12(STI) ? "row_share" : "row_newbcast") << " */";
return;
@@ -1280,6 +1282,17 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
(ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
+ // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but no
+ // src1.
+ if (NumOps == 1 && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2) &&
+ !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1)) {
+ Ops[NumOps++] = DefaultValue; // Set src1_modifiers to default.
+ int Mod2Idx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
+ assert(Mod2Idx != -1);
+ Ops[NumOps++] = MI->getOperand(Mod2Idx).getImm();
+ }
+
const bool HasDst =
(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst) != -1) ||
(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst) != -1);
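
The printPackedModifier hunk handles instructions that have src0 and src2 but no src1 (e.g. v_interp_p2_f16 on GFX9): a default value is inserted into the src1 slot so that src2's modifier still contributes the correct bit position. Below is a minimal standalone model of that slot-preserving packing; the function and parameter names are hypothetical and only mirror the idea, not the printer's actual code.

#include <cstdint>
#include <optional>
#include <vector>

// Packs one bit per source slot (bit 0 = src0, bit 1 = src1, bit 2 = src2).
// A missing source contributes DefaultValue so later slots keep their position.
unsigned packModifierBits(const std::vector<std::optional<uint32_t>> &SrcMods,
                          uint32_t DefaultValue, uint32_t BitMask) {
  unsigned Packed = 0;
  for (size_t Slot = 0; Slot < SrcMods.size(); ++Slot)
    if (SrcMods[Slot].value_or(DefaultValue) & BitMask)
      Packed |= 1u << Slot;
  return Packed;
}

// Example use: {mod0, std::nullopt, mod2} still reports src2's bit in slot 2.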
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 291c03a..64e34db 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1516,7 +1516,8 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11)));
RegisterOperand RegClass = MIMGAddrSize<num_addrs, 0>.RegClass;
- defvar Size = !cast<SIRegisterClass>(RegClass.RegClass).Size;
+ defvar Size = !cast<SIRegisterClassLike>(RegClass.RegClass).Size;
+
int VAddrDwords = !srl(Size, 5);
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 90c828b..6616b30 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1077,7 +1077,7 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
+ int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
if (RCID == -1)
return false;
@@ -1299,10 +1299,8 @@ void SIFoldOperandsImpl::foldOperand(
AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
const MCInstrDesc &MovDesc = TII->get(MovOp);
- assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
-
const TargetRegisterClass *MovDstRC =
- TRI->getRegClass(MovDesc.operands()[0].RegClass);
+ TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
// Fold if the destination register class of the MOV instruction (ResRC)
// is a superclass of (or equal to) the destination register class of the
@@ -1312,7 +1310,8 @@ void SIFoldOperandsImpl::foldOperand(
const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
const TargetRegisterClass *MovSrcRC =
- TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
+ TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
+
if (MovSrcRC) {
if (UseSubReg)
MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e233457..1a686a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17346,74 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineFunction *MF = MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
- // Prefer VGPRs over AGPRs in mAI instructions where possible.
- // This saves a chain-copy of registers and better balance register
- // use between vgpr and agpr as agpr tuples tend to be big.
- if (!MI.getDesc().operands().empty()) {
- unsigned Opc = MI.getOpcode();
- bool HasAGPRs = Info->mayNeedAGPRs();
- const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- for (auto I :
- {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
- if (I == -1)
- break;
- if ((I == Src2Idx) && (HasAGPRs))
- break;
- MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !Op.getReg().isVirtual())
- continue;
- auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
- if (!TRI->hasAGPRs(RC))
- continue;
- auto *Src = MRI.getUniqueVRegDef(Op.getReg());
- if (!Src || !Src->isCopy() ||
- !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
- continue;
- auto *NewRC = TRI->getEquivalentVGPRClass(RC);
- // All uses of agpr64 and agpr32 can also accept vgpr except for
- // v_accvgpr_read, but we do not produce agpr reads during selection,
- // so no use checks are needed.
- MRI.setRegClass(Op.getReg(), NewRC);
- }
-
- if (TII->isMAI(MI)) {
- // The ordinary src0, src1, src2 were legalized above.
- //
- // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
- // as a separate instruction.
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::scale_src0);
- if (Src0Idx != -1) {
- int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::scale_src1);
- if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
- TII->usesConstantBus(MRI, MI, Src1Idx))
- TII->legalizeOpWithMove(MI, Src1Idx);
- }
- }
-
- if (!HasAGPRs)
- return;
-
- // Resolve the rest of AV operands to AGPRs.
- if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
- if (Src2->isReg() && Src2->getReg().isVirtual()) {
- auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
- if (TRI->isVectorSuperClass(RC)) {
- auto *NewRC = TRI->getEquivalentAGPRClass(RC);
- MRI.setRegClass(Src2->getReg(), NewRC);
- if (Src2->isTied())
- MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
- }
- }
+ if (TII->isMAI(MI)) {
+ // The ordinary src0, src1, src2 were legalized above.
+ //
+ // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+ // as a separate instruction.
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::scale_src0);
+ if (Src0Idx != -1) {
+ int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::scale_src1);
+ if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
+ TII->usesConstantBus(MRI, MI, Src1Idx))
+ TII->legalizeOpWithMove(MI, Src1Idx);
}
}
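
The retained MAI block above enforces the VALU constant-bus rule for the appended v_mfma scale operands: only a limited number of scalar (SGPR or literal) sources may be read through the constant bus per instruction (the exact limit is subtarget dependent), so when both scale_src0 and scale_src1 would need it, scale_src1 is rewritten with a move. A standalone sketch of that legalization policy, with illustrative names rather than the LLVM API:

#include <vector>

struct OperandModel {
  bool UsesConstantBus; // SGPR or literal source
};

// Returns the indices of operands that must be copied into VGPRs so that at
// most Limit operands still read the constant bus (the moral equivalent of
// calling legalizeOpWithMove on the excess operand).
std::vector<size_t> operandsToLegalize(const std::vector<OperandModel> &Ops,
                                       unsigned Limit) {
  std::vector<size_t> MoveToVGPR;
  unsigned Used = 0;
  for (size_t I = 0; I < Ops.size(); ++I) {
    if (!Ops[I].UsesConstantBus)
      continue;
    if (Used < Limit)
      ++Used;                  // this operand may keep its scalar form
    else
      MoveToVGPR.push_back(I); // everything past the limit needs a move
  }
  return MoveToVGPR;
}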
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 46757cf..ec5c5bb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5029,9 +5029,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
- int RegClass = Desc.operands()[i].RegClass;
-
const MCOperandInfo &OpInfo = Desc.operands()[i];
+ int16_t RegClass = getOpRegClassID(OpInfo);
+
switch (OpInfo.OperandType) {
case MCOI::OPERAND_REGISTER:
if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
@@ -5635,7 +5635,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
!AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
- AMDGPU::isDPALU_DPP(Desc, ST)) {
+ AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
ErrInfo = "Invalid dpp_ctrl value: "
"DP ALU dpp only support row_newbcast";
return false;
@@ -6031,48 +6031,17 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
}
-static const TargetRegisterClass *
-adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
- const MCInstrDesc &TID, unsigned RCID) {
- if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {
- switch (RCID) {
- case AMDGPU::AV_32RegClassID:
- RCID = AMDGPU::VGPR_32RegClassID;
- break;
- case AMDGPU::AV_64RegClassID:
- RCID = AMDGPU::VReg_64RegClassID;
- break;
- case AMDGPU::AV_96RegClassID:
- RCID = AMDGPU::VReg_96RegClassID;
- break;
- case AMDGPU::AV_128RegClassID:
- RCID = AMDGPU::VReg_128RegClassID;
- break;
- case AMDGPU::AV_160RegClassID:
- RCID = AMDGPU::VReg_160RegClassID;
- break;
- case AMDGPU::AV_512RegClassID:
- RCID = AMDGPU::VReg_512RegClassID;
- break;
- default:
- break;
- }
- }
-
- return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
-}
-
+// FIXME: This should not be an overridable function. All subtarget dependent
+// operand modifications should go through isLookupRegClassByHwMode in the
+// generic handling.
const TargetRegisterClass *
SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
if (OpNum >= TID.getNumOperands())
return nullptr;
- auto RegClass = TID.operands()[OpNum].RegClass;
- // Special pseudos have no alignment requirement.
- if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
- return RI.getRegClass(RegClass);
-
- return adjustAllocatableRegClass(ST, RI, TID, RegClass);
+ const MCOperandInfo &OpInfo = TID.operands()[OpNum];
+ int16_t RegClass = getOpRegClassID(OpInfo);
+ return RI.getRegClass(RegClass);
}
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
@@ -6090,8 +6059,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
return RI.getPhysRegBaseClass(Reg);
}
- unsigned RCID = Desc.operands()[OpNo].RegClass;
- return adjustAllocatableRegClass(ST, RI, Desc, RCID);
+ return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6099,7 +6067,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
+ unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Size = RI.getRegSizeInBits(*RC);
unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
@@ -6168,7 +6136,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
Register Reg = MO.getReg();
- const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
+ const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
if (Reg.isPhysical())
return DRC->contains(Reg);
@@ -6293,8 +6261,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
+ int64_t RegClass = getOpRegClassID(OpInfo);
const TargetRegisterClass *DefinedRC =
- OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
+ RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
if (!MO)
MO = &MI.getOperand(OpIdx);
@@ -7619,7 +7588,7 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
if (!RI.isVGPRClass(CurrRC))
return;
- unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
Op.setSubReg(AMDGPU::lo16);
@@ -9323,7 +9292,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
// Is this operand statically required to be an SGPR based on the operand
// constraints?
const TargetRegisterClass *OpRC =
- RI.getRegClass(Desc.operands()[Idx].RegClass);
+ RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
if (IsRequiredSGPR)
return MO.getReg();
@@ -9804,7 +9773,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
if (Idx == -1) // e.g. s_memtime
return false;
- const auto RCID = MI.getDesc().operands()[Idx].RegClass;
+ const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index cc59acf..e979eeb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1298,7 +1298,7 @@ public:
return 4;
}
- return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
+ return RI.getRegSizeInBits(*RI.getRegClass(getOpRegClassID(OpInfo))) / 8;
}
/// This form should usually be preferred since it handles operands
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 18a5393..b7f63ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1151,6 +1151,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
+// FIXME: Should change class based on hasSDWAScalar to exclude SGPRs
class SDWASrc<ValueType vt> : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
string Type = !if(vt.isFP, "FP", "INT");
@@ -1807,13 +1808,13 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16,
VOPDstOperand_t16Lo128),
VOPDstOperand<VGPR_32>);
- RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>,
- !eq(VT.Size, 512) : VOPDstOperand<VReg_512>,
- !eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
- !eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
- !eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
- !eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
- !eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024_AlignTarget>,
+ !eq(VT.Size, 512) : VOPDstOperand<VReg_512_AlignTarget>,
+ !eq(VT.Size, 256) : VOPDstOperand<VReg_256_AlignTarget>,
+ !eq(VT.Size, 192) : VOPDstOperand<VReg_192_AlignTarget>,
+ !eq(VT.Size, 128) : VOPDstOperand<VReg_128_AlignTarget>,
+ !eq(VT.Size, 96) : VOPDstOperand<VReg_96_AlignTarget>,
+ !eq(VT.Size, 64) : VOPDstOperand<VReg_64_AlignTarget>,
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
!eq(VT.Size, 16) : op16,
1 : VOPDstS64orS32); // else VT == i1
@@ -1821,8 +1822,8 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
class getVALUDstForVT_fake16<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
- !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
- !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
+ !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128_AlignTarget>,
+ !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64_AlignTarget>,
!if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32_Lo128>,
VOPDstS64orS32)))); // else VT == i1
}
@@ -1890,21 +1891,38 @@ class getSOPSrcForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
}
-// Returns the vreg register class to use for source operand given VT
+// Returns the vreg register operand to use for source operand given VT.
+// This should only be used for a target instruction's ins list.
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> {
RegisterOperand ret =
- !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>,
- !eq(VT.Size, 192) : RegisterOperand<VReg_192>,
- !eq(VT.Size, 128) : RegisterOperand<VReg_128>,
- !eq(VT.Size, 96) : RegisterOperand<VReg_96>,
- !eq(VT.Size, 64) : RegisterOperand<VReg_64>,
- !eq(VT.Size, 48) : RegisterOperand<VReg_64>,
+ !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512_AlignTarget>,
+ !eq(VT.Size, 192) : RegisterOperand<VReg_192_AlignTarget>,
+ !eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>,
+ !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>,
+ !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>,
+ !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>,
!eq(VT.Size, 16) : !if(IsTrue16,
!if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128),
RegisterOperand<VGPR_32>),
1 : RegisterOperand<VGPR_32>);
}
+// Returns a concrete vgpr register class to use for a value type VT,
+// which exists separately from a real instruction use.
+class getVregClassForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> {
+ RegisterClass ret =
+ !cond(!eq(VT.Size, 512) : VReg_512,
+ !eq(VT.Size, 192) : VReg_192,
+ !eq(VT.Size, 128) : VReg_128,
+ !eq(VT.Size, 96) : VReg_96,
+ !eq(VT.Size, 64) : VReg_64,
+ !eq(VT.Size, 48) : VReg_64,
+ !eq(VT.Size, 16) : !if(IsTrue16,
+ !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128),
+ VGPR_32),
+ 1 : VGPR_32);
+}
+
class getSDWASrcForVT <ValueType VT> {
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
@@ -2638,7 +2656,7 @@ class getAlign2RegOp<RegisterOperand RC> {
}
class getEquivalentAGPROperand<RegisterOperand RC> {
- defvar Size = !cast<RegisterClass>(RC.RegClass).Size;
+ defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size;
RegisterOperand ret =
!cond(!eq(Size, 32) : RegisterOperand<AGPR_32>,
!eq(Size, 64) : RegisterOperand<AReg_64>,
@@ -2649,16 +2667,33 @@ class getEquivalentAGPROperand<RegisterOperand RC> {
}
class getEquivalentVGPROperand<RegisterOperand RC> {
- defvar Size = !cast<RegisterClass>(RC.RegClass).Size;
+ defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size;
RegisterOperand ret =
- !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>,
- !eq(Size, 64) : RegisterOperand<VReg_64>,
- !eq(Size, 96) : RegisterOperand<VReg_96>,
- !eq(Size, 128) : RegisterOperand<VReg_128>,
- !eq(Size, 160) : RegisterOperand<VReg_160>,
- !eq(Size, 1024) : RegisterOperand<VReg_1024>);
+ !cond(
+ !eq(RC, VGPROp_32) : VGPROp_32,
+ !eq(RC, VGPROp_64) : VGPROp_64,
+
+ !eq(RC, AVLdSt_32) : VGPROp_32,
+ !eq(RC, AVLdSt_64) : VGPROp_64,
+ !eq(RC, AVLdSt_96) : VGPROp_96,
+ !eq(RC, AVLdSt_128) : VGPROp_128,
+ !eq(RC, AVLdSt_160) : VGPROp_160,
+ !eq(RC, AVLdSt_1024) : VGPROp_1024,
+
+ !eq(RC, AVLdSt_64_Align2) : VGPROp_64_Align2,
+ !eq(RC, AVLdSt_96_Align2) : VGPROp_96_Align2,
+ !eq(RC, AVLdSt_128_Align2) : VGPROp_128_Align2,
+ !eq(RC, AVLdSt_160_Align2) : VGPROp_160_Align2,
+ !eq(RC, AVLdSt_1024_Align2) : VGPROp_1024_Align2,
+
+ !eq(RC, AVLdSt_64_Align1) : VGPROp_64_Align1,
+ !eq(RC, AVLdSt_96_Align1) : VGPROp_96_Align1,
+ !eq(RC, AVLdSt_128_Align1) : VGPROp_128_Align1,
+ !eq(RC, AVLdSt_160_Align1) : VGPROp_160_Align1,
+ !eq(RC, AVLdSt_1024_Align1) : VGPROp_1024_Align1);
}
+
class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32, ValueType Src2VT = i32> {
bit ret = !if(!eq(DstVT.Size, 64),
@@ -3190,7 +3225,7 @@ class Commutable_REV <string revOp, bit isOrig> {
// Interpolation opcodes
//===----------------------------------------------------------------------===//
-class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">;
+class VINTRPDstOperand <RegisterClassLike rc> : RegisterOperand <rc, "printVINTRPDst">;
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index be084a9..eac9fd4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -120,6 +120,8 @@ def ATOMIC_FENCE : SPseudoInstSI<
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
+// No align needed as it will be decomposed anyway
+// TODO: Remove alignment requirement from sources
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;
@@ -129,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
(ins VSrc_b64:$src0)> {
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
@@ -163,9 +165,6 @@ def AV_MOV_B32_IMM_PSEUDO
// 64-bit materialize immediate which supports AGPR or VGPR. This has
// an unusual operand restriction which requires the two halves of the
// immediate to each be 32-bit inline immediate values.
-//
-// FIXME: This unnecessarily has the even aligned vector register
-// requirement applied.
def AV_MOV_B64_IMM_PSEUDO
: VPseudoInstSI<(outs AV_64:$vdst), (ins AV_64_PSEUDO_IMM:$src0)> {
let isReMaterializable = 1;
@@ -381,13 +380,13 @@ foreach Op = Operations in {
let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
- (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
- [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
+ (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
>;
def V_SUB_U64_PSEUDO : VPseudoInstSI <
- (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
- [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
+ (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC]
@@ -1142,7 +1141,7 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
-multiclass SI_SPILL_VGPR <RegisterClass vgpr_class,
+multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class,
bit UsesTmp = 0, bit HasMask = 0> {
let UseNamedOperandTable = 1, Spill = 1, VALU = 1,
SchedRW = [WriteVMEM] in {
@@ -1177,21 +1176,25 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class,
} // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}
+// TODO: Technically the AlignTarget register class constraint is
+// overly conservative for gfx90a. There is an alignment requirement,
+// but the underlying spill will be lowered to 32-bit accesses.
+
defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>;
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
-defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
-defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
-defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
-defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
-defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
-defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
-defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
-defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
-defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
-defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
-defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
-defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
-defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64_AlignTarget>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96_AlignTarget>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128_AlignTarget>;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160_AlignTarget>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192_AlignTarget>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224_AlignTarget>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256_AlignTarget>;
+defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288_AlignTarget>;
+defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320_AlignTarget>;
+defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352_AlignTarget>;
+defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384_AlignTarget>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512_AlignTarget>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024_AlignTarget>;
let Defs = [M0] in {
// Spills a block of 32 VGPRs. M0 will contain a mask describing which
@@ -1200,34 +1203,34 @@ let Defs = [M0] in {
}
defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
-defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
-defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
-defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
-defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
-defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
-defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
-defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
-defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
-defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
-defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
-defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
-defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
-defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
+defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64_AlignTarget, 1>;
+defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96_AlignTarget, 1>;
+defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128_AlignTarget, 1>;
+defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160_AlignTarget, 1>;
+defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192_AlignTarget, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224_AlignTarget, 1>;
+defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256_AlignTarget, 1>;
+defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288_AlignTarget, 1>;
+defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320_AlignTarget, 1>;
+defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352_AlignTarget, 1>;
+defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384_AlignTarget, 1>;
+defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512_AlignTarget, 1>;
+defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024_AlignTarget, 1>;
defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
-defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
-defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
-defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
-defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
-defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
-defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
-defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
-defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
-defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
-defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
-defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
-defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
-defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
+defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64_AlignTarget, 1>;
+defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96_AlignTarget, 1>;
+defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128_AlignTarget, 1>;
+defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160_AlignTarget, 1>;
+defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192_AlignTarget, 1>;
+defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224_AlignTarget, 1>;
+defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256_AlignTarget, 1>;
+defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288_AlignTarget, 1>;
+defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320_AlignTarget, 1>;
+defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352_AlignTarget, 1>;
+defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384_AlignTarget, 1>;
+defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512_AlignTarget, 1>;
+defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024_AlignTarget, 1>;
let isConvergent = 1 in {
defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
@@ -2383,18 +2386,24 @@ let True16Predicate = UseRealTrue16Insts in {
}
}
-// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
-// immediate and wil be expanded as needed, but we will only use these patterns
-// for values which can be encoded.
-def : GCNPat <
- (VGPRImm<(i64 imm)>:$imm),
- (V_MOV_B64_PSEUDO imm:$imm)
->;
+/// FIXME: Increasing the priority of VGPRImm over the scalar forms as
+/// a workaround for a phase ordering problem caused by overly
+/// conservative MachineCSE. If we end up with an s_mov_b64 + copy to
+/// vgpr pattern, MachineCSE will not perform the CSE which occurs
+/// after operand folding.
+let AddedComplexity = 1 in {
+ // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
+ // immediate and wil be expanded as needed, but we will only use these patterns
+ // for values which can be encoded.
+ def : GCNPat <
+ (VGPRImm<(i64 imm)>:$imm),
+ (V_MOV_B64_PSEUDO imm:$imm)>;
-def : GCNPat <
- (VGPRImm<(f64 fpimm)>:$imm),
- (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
->;
+ def : GCNPat <
+ (VGPRImm<(f64 fpimm)>:$imm),
+ (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
+ >;
+} // End let AddedComplexity = 1
def : GCNPat <
(i64 imm:$imm),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 908d856..b398db4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -33,17 +33,20 @@ using namespace llvm;
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
-static cl::opt<bool> MFMAVGPRForm(
- "amdgpu-mfma-vgpr-form", cl::Hidden,
+static cl::opt<bool, true> MFMAVGPRFormOpt(
+ "amdgpu-mfma-vgpr-form",
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
- cl::init(false));
+ cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
+ cl::Hidden);
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}
+bool SIMachineFunctionInfo::MFMAVGPRForm = false;
+
SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
const GCNSubtarget *STI)
: AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
@@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts();
if (ST.hasGFX90AInsts()) {
- // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
- // should be separated from availability of AGPRs
- if (MFMAVGPRForm ||
- (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
- !mayUseAGPRs(F)))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
+ // allocation granule and clamping.
+ auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
+ /*OnlyFirstRequired=*/true);
+ MinNumAGPRs = MinNumAGPRAttr;
}
if (AMDGPU::isChainCC(CC)) {
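
SIMachineFunctionInfo now seeds MinNumAGPRs from the "amdgpu-agpr-alloc" function attribute through AMDGPU::getIntegerPairAttribute, with {~0u, ~0u} meaning the attribute was not specified. The sketch below only illustrates that convention with a hand-rolled parser; it assumes the attribute value is one or two comma-separated unsigned integers and is not the helper LLVM actually uses.

#include <cstdio>
#include <cstdlib>
#include <string>
#include <utility>

// Parses "N" or "N,M"; an empty/absent value keeps the {~0u, ~0u} default,
// which the new code treats as "no explicit AGPR budget was requested".
std::pair<unsigned, unsigned> parseAgprAllocAttr(const std::string &Value) {
  std::pair<unsigned, unsigned> Result{~0u, ~0u};
  if (Value.empty())
    return Result;
  Result.first = static_cast<unsigned>(std::strtoul(Value.c_str(), nullptr, 10));
  if (size_t Comma = Value.find(','); Comma != std::string::npos)
    Result.second = static_cast<unsigned>(
        std::strtoul(Value.c_str() + Comma + 1, nullptr, 10));
  return Result;
}

int main() {
  auto [MinAGPRs, MaxAGPRs] = parseAgprAllocAttr("4,64");
  std::printf("min=%u max=%u\n", MinAGPRs, MaxAGPRs);
  return 0;
}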
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 4560615..b7dbb59 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -509,7 +509,9 @@ private:
// user arguments. This is an offset from the KernargSegmentPtr.
bool ImplicitArgPtr : 1;
- bool MayNeedAGPRs : 1;
+ /// Minimum number of AGPRs required to allocate in the function. Only
+ /// relevant for gfx90a-gfx950. For gfx908, this should be infinite.
+ unsigned MinNumAGPRs = ~0u;
// The hard-wired high half of the address of the global information table
// for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
@@ -537,6 +539,8 @@ private:
void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
public:
+ static bool MFMAVGPRForm;
+
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
@@ -1196,9 +1200,7 @@ public:
unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
- bool mayNeedAGPRs() const {
- return MayNeedAGPRs;
- }
+ unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
// \returns true if a function has a use of AGPRs via inline asm or
// has a call which may use it.
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index afe76e1..bfac639 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1338,8 +1338,9 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
continue;
unsigned I = Op.getOperandNo();
- if (Desc.operands()[I].RegClass == -1 ||
- !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
+
+ int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]);
+ if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass)))
continue;
if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3115579..be1c883 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -328,7 +328,8 @@ struct SGPRSpillBuilder {
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
ST.getAMDGPUDwarfFlavour(),
- /*PC=*/0, ST.getHwMode()),
+ /*PC=*/0,
+ ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 82fc240..fc8f46a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -91,16 +91,23 @@ class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0,
int Index = !cast<int>(regIdx);
}
-// For register classes that use TSFlags.
-class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
- : RegisterClass <n, rTypes, Align, rList> {
+class SIRegisterClassLike<int BW = 0, bit V = false,
+ bit A = false,
+ bit S = false> {
+ // Bitwidth of the register
+ field int Size = BW;
+
// For vector register classes.
- field bit HasVGPR = 0;
- field bit HasAGPR = 0;
+ field bit HasVGPR = V;
+ field bit HasAGPR = A;
// For scalar register classes.
- field bit HasSGPR = 0;
+ field bit HasSGPR = S;
+}
+// For register classes that use TSFlags.
+class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
+ : RegisterClass <n, rTypes, Align, rList>, SIRegisterClassLike {
// Alignment of the first register in tuple (in 32-bit units).
field int RegTupleAlignUnits = 1;
@@ -991,7 +998,8 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1, BaseClassPriority = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1,
+ DecoderMethod = "DecodeVReg_"#!mul(numRegs, 32)#"RegisterClass" in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1031,7 +1039,8 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1,
+ DecoderMethod = "DecodeAReg_"#!mul(numRegs, 32)#"RegisterClass" in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1197,15 +1206,87 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102
}
//===----------------------------------------------------------------------===//
-// Register operands
+//
+// AlignTarget classes. Artificial classes that swap between
+// even-aligned and any-aligned classes depending on the subtarget.
+//
//===----------------------------------------------------------------------===//
+def AV_LdSt_32_Target : RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> {
+ let DecoderMethod = "decodeAVLdSt";
+}
+
+foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in {
+ def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
+ let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass";
+ }
+
+ def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/],
+ [!cast<RegisterClass>("AReg_"#RegSize),
+ !cast<RegisterClass>("AReg_"#RegSize#_Align2)
+ /*Unused combination*/]> {
+ let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass";
+ }
+
+ def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [!cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
+ let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass";
+ }
+
+ def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
+ let DecoderMethod = "decodeAVLdSt";
+ }
+
+ def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [!cast<RegisterClass>("VReg_"#RegSize#_Align2),
+ !cast<RegisterClass>("AV_"#RegSize#_Align2),
+ !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> {
+ let DecoderMethod = "decodeAVLdSt";
+ }
+
+ def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [!cast<RegisterClass>("VReg_"#RegSize),
+ !cast<RegisterClass>("AV_"#RegSize),
+ !cast<RegisterClass>("VReg_"#RegSize)]> {
+ let DecoderMethod = "decodeAVLdSt";
+ }
+}
+
+def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>,
+ RegClassByHwMode<
+ [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode],
+ [VS_64, VS_64_Align2, VS_64_Align2]> {
+ let DecoderMethod = "decodeSrcRegOrImm9";
+}
+
class RegImmMatcher<string name> : AsmOperandClass {
let Name = name;
let RenderMethod = "addRegOrImmOperands";
}
-class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
+class RegOrImmOperand <RegisterClassLike RegClass, string OperandTypeName>
: RegisterOperand<RegClass> {
let OperandNamespace = "AMDGPU";
let OperandType = OperandTypeName;
@@ -1213,14 +1294,18 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
}
//===----------------------------------------------------------------------===//
+// Register operands
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate
// if supported by target.
//===----------------------------------------------------------------------===//
-class SrcRegOrImm9<RegisterClass regClass, string operandType>
+class SrcRegOrImm9<RegisterClassLike regClass, string operandType>
: RegOrImmOperand<regClass, operandType> {
string DecoderMethodName = "decodeSrcRegOrImm9";
- let DecoderMethod = DecoderMethodName # "<" # regClass.Size # ">";
+ let DecoderMethod = DecoderMethodName # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
}
class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16>
@@ -1277,12 +1362,12 @@ def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">;
def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">;
def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">;
def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">;
-def VSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_INT64">;
-def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> {
+def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">;
+def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> {
let DecoderMethod = "decodeOperand_VSrc_f64";
}
-def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">;
-def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">;
+def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">;
+def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">;
def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">;
@@ -1292,19 +1377,19 @@ def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16
// This is for operands with the enum(9), VSrc encoding restriction,
// but only allows VGPRs.
-class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> {
- let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">";
+class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> {
+ let DecoderMethod = "decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
}
def VRegSrc_32 : SrcReg9<VGPR_32>;
-def VRegSrc_64 : SrcReg9<VReg_64>;
-def VRegSrc_96 : SrcReg9<VReg_96>;
-def VRegSrc_128 : SrcReg9<VReg_128>;
-def VRegSrc_192 : SrcReg9<VReg_192>;
-def VRegSrc_256 : SrcReg9<VReg_256>;
-def VRegSrc_384 : SrcReg9<VReg_384>;
-def VRegSrc_512 : SrcReg9<VReg_512>;
-def VRegSrc_1024 : SrcReg9<VReg_1024>;
+def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>;
+def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>;
+def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>;
+def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>;
+def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>;
+def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>;
+def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>;
+def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
// True 16 Operands
@@ -1325,23 +1410,23 @@ class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> {
class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> {
let DecoderMethod = "Decode" # regClass # "RegisterClass";
}
-multiclass VGPROp_Aligned<RegisterClass regClass> {
- def _Align1 : VGPROp<regClass>;
- def _Align2 : VGPROp_Align2<regClass>;
-}
// TODO: These cases should use default target alignment
def VGPROp_16 : VGPROp<VGPR_16> {
let EncoderMethod = "getMachineOpValueT16";
}
+
def VGPROp_32 : VGPROp<VGPR_32>;
foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "320", "352", "384", "512", "1024"] in {
- def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>;
-}
+ // Target default alignment
+ def VGPROp_#size : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_AlignTarget)>;
+
+ // No alignment requirement
+ def VGPROp_#size#_Align1 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size)>;
-foreach size = ["64", "96", "128", "160", "256", "1024"] in {
- defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>;
+ // Always even alignment requirement
+ def VGPROp_#size#_Align2 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_Align2)>;
}
def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
@@ -1357,9 +1442,9 @@ def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
-class AVOperand<RegisterClass regClass, string decoder>
+class AVOperand<RegisterClassLike regClass, string decoder>
: RegisterOperand<regClass> {
- let DecoderMethod = decoder # "<" # regClass.Size # ">";
+ let DecoderMethod = decoder # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
let EncoderMethod = "getAVOperandEncoding";
}
@@ -1374,13 +1459,13 @@ def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">;
def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">;
def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">;
def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">;
-def VCSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_INT64">;
-def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
+def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">;
+def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">;
-def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
+def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">;
def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">;
// True 16 Operands
@@ -1391,73 +1476,80 @@ def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">;
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_BF16">;
-def VISrc_64_f16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP16">;
-def VISrc_64_b32 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_64_f64 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_BF16">;
-def VISrc_128_f16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP16">;
-def VISrc_128_b32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_128_f32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_256_b32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">;
-def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">;
-def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">;
-def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
+def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
+def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">;
+def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">;
+def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">;
+def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">;
+def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">;
//===----------------------------------------------------------------------===//
// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
//===----------------------------------------------------------------------===//
-class AVSrcOperand<RegisterClass regClass>
+class AVSrcOperand<RegisterClassLike regClass>
: AVOperand<regClass, "decodeSrcAV10">;
def AVSrc_32 : AVSrcOperand<AV_32>;
-def AVSrc_64 : AVSrcOperand<AV_64>;
-def AVSrc_128 : AVSrcOperand<AV_128>;
-def AVSrc_192 : AVSrcOperand<AV_192>;
-def AVSrc_256 : AVSrcOperand<AV_256>;
+def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>;
+def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>;
+def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>;
+def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>;
-class AVDstOperand<RegisterClass regClass>
+def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>;
+def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>;
+def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>;
+def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>;
+
+class AVDstOperand<RegisterClassLike regClass>
: AVOperand<regClass, "decodeAV10">;
def AVDst_128 : AVDstOperand<AV_128>;
def AVDst_256 : AVDstOperand<AV_256>;
def AVDst_512 : AVDstOperand<AV_512>;
-class AVLdStOperand<RegisterClass regClass>
+def AVDst_128_Align2 : AVDstOperand<AV_128_Align2>;
+def AVDst_256_Align2 : AVDstOperand<AV_256_Align2>;
+def AVDst_512_Align2 : AVDstOperand<AV_512_Align2>;
+
+class AVLdStOperand<RegisterClassLike regClass>
: AVOperand<regClass, "decodeAVLdSt">;
-def AVLdSt_32 : AVLdStOperand<AV_32>;
+def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>;
foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
- // TODO: These cases should use target align variant
- def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
-
- def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
- def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>;
+ def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>;
+ def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>;
+ def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>;
}
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
-class SrcRegOrImmA9<RegisterClass regClass, string operandType>
+class SrcRegOrImmA9<RegisterClassLike regClass, string operandType>
: RegOrImmOperand<regClass, operandType> {
- let DecoderMethod = "decodeSrcRegOrImmA9<" # regClass.Size # ">";
+ let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">";
}
-def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64, "OPERAND_REG_INLINE_AC_FP64">;
-def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_INT32">;
-def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256, "OPERAND_REG_INLINE_AC_FP64">;
-def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">;
-def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">;
-def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
+def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">;
+def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
+def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">;
+def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">;
//===----------------------------------------------------------------------===//
// Tablegen programming utilities
@@ -1467,10 +1559,10 @@ def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">;
/// instruction's operand list, which may be a RegisterOperand or a
/// direct RegisterClass reference.
class getRegClassFromOp<DAGOperand Op> {
- SIRegisterClass ret = !if(
+ SIRegisterClassLike ret = !if(
!isa<RegisterOperand>(Op),
- !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass),
- !cast<SIRegisterClass>(Op));
+ !cast<SIRegisterClassLike>(!cast<RegisterOperand>(Op).RegClass),
+ !cast<SIRegisterClassLike>(Op));
}
/// Check if the operand will use an AV_* class.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index f7f4d46..76023d2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1569,12 +1569,7 @@ static bool isValidRegPrefix(char C) {
return C == 'v' || C == 's' || C == 'a';
}
-std::tuple<char, unsigned, unsigned>
-parseAsmConstraintPhysReg(StringRef Constraint) {
- StringRef RegName = Constraint;
- if (!RegName.consume_front("{") || !RegName.consume_back("}"))
- return {};
-
+std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef RegName) {
char Kind = RegName.front();
if (!isValidRegPrefix(Kind))
return {};
@@ -1601,6 +1596,14 @@ parseAsmConstraintPhysReg(StringRef Constraint) {
return {};
}
+std::tuple<char, unsigned, unsigned>
+parseAsmConstraintPhysReg(StringRef Constraint) {
+ StringRef RegName = Constraint;
+ if (!RegName.consume_front("{") || !RegName.consume_back("}"))
+ return {};
+ return parseAsmPhysRegName(RegName);
+}
+
std::pair<unsigned, unsigned>
getIntegerPairAttribute(const Function &F, StringRef Name,
std::pair<unsigned, unsigned> Default,
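
[Editor's note] A minimal usage sketch for the parser split introduced in the hunk above, assuming the declarations from Utils/AMDGPUBaseInfo.h; the register name "a[4:7]" and all variable names are illustrative, not taken from the patch.

    // Illustrative only; not part of the patch.
    #include "Utils/AMDGPUBaseInfo.h"  // backend-internal header declaring both parsers
    #include <tuple>

    void demoParse() {
      using namespace llvm;
      // Bare register name: the new entry point.
      auto [Kind, Start, Width] = AMDGPU::parseAsmPhysRegName("a[4:7]");
      // Inline-asm constraint form: strips "{...}" and forwards to the above.
      auto [CKind, CStart, CWidth] = AMDGPU::parseAsmConstraintPhysReg("{a[4:7]}");
      // On malformed input both return a zeroed tuple (Kind == 0).
      (void)Kind; (void)Start; (void)Width;
      (void)CKind; (void)CStart; (void)CWidth;
    }
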
@@ -2927,13 +2930,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
return getRegBitWidth(RC.getID());
}
-unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
- unsigned OpNo) {
- assert(OpNo < Desc.NumOperands);
- unsigned RCID = Desc.operands()[OpNo].RegClass;
- return getRegBitWidth(RCID) / 8;
-}
-
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
if (isInlinableIntLiteral(Literal))
return true;
@@ -3499,14 +3495,18 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
return false;
}
-bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
+bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
+ const MCSubtargetInfo &ST) {
for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) {
int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
if (Idx == -1)
continue;
- if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID ||
- OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID)
+ const MCOperandInfo &OpInfo = OpDesc.operands()[Idx];
+ int16_t RegClass = MII.getOpRegClassID(
+ OpInfo, ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
+ if (RegClass == AMDGPU::VReg_64RegClassID ||
+ RegClass == AMDGPU::VReg_64_Align2RegClassID)
return true;
}
@@ -3533,14 +3533,15 @@ bool isDPALU_DPP32BitOpc(unsigned Opc) {
}
}
-bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
+ const MCSubtargetInfo &ST) {
if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP))
return false;
if (isDPALU_DPP32BitOpc(OpDesc.getOpcode()))
return ST.hasFeature(AMDGPU::FeatureGFX1250Insts);
- return hasAny64BitVGPROperands(OpDesc);
+ return hasAny64BitVGPROperands(OpDesc, MII, ST);
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
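
[Editor's note] The hasAny64BitVGPROperands hunk above replaces direct reads of MCOperandInfo::RegClass with a per-hardware-mode lookup through MCInstrInfo. A hedged sketch of that pattern in isolation; the helper name is invented, and the generated AMDGPU register-class IDs are assumed to be in scope.

    #include "llvm/MC/MCInstrDesc.h"
    #include "llvm/MC/MCInstrInfo.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    // AMDGPU::VReg_64RegClassID / VReg_64_Align2RegClassID come from the
    // generated AMDGPU register info and are assumed to be available.

    static bool isVReg64Operand(const llvm::MCInstrDesc &Desc, unsigned OpIdx,
                                const llvm::MCInstrInfo &MII,
                                const llvm::MCSubtargetInfo &STI) {
      const llvm::MCOperandInfo &OpInfo = Desc.operands()[OpIdx];
      // Resolve the operand's register class for the subtarget's register-info
      // hardware mode instead of trusting MCOperandInfo::RegClass directly.
      int16_t RCID = MII.getOpRegClassID(
          OpInfo, STI.getHwMode(llvm::MCSubtargetInfo::HwMode_RegInfo));
      return RCID == llvm::AMDGPU::VReg_64RegClassID ||
             RCID == llvm::AMDGPU::VReg_64_Align2RegClassID;
    }
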
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2b9c063..49b4d02 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1014,6 +1014,13 @@ bool isReadOnlySegment(const GlobalValue *GV);
bool shouldEmitConstantsToTextSection(const Triple &TT);
/// Returns a valid charcode or 0 in the first entry if this is a valid physical
+/// register name. Followed by the start register number, and the register
+/// width. Does not validate the number of registers exists in the class. Unlike
+/// parseAsmConstraintPhysReg, this does not expect the name to be wrapped in
+/// "{}".
+std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef TupleString);
+
+/// Returns a valid charcode or 0 in the first entry if this is a valid physical
/// register constraint. Followed by the start register number, and the register
/// width. Does not validate the number of registers exists in the class.
std::tuple<char, unsigned, unsigned>
@@ -1620,10 +1627,6 @@ unsigned getRegBitWidth(unsigned RCID);
/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
-/// Get size of register operand
-unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
- unsigned OpNo);
-
LLVM_READNONE
inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
switch (OpInfo.OperandType) {
@@ -1780,13 +1783,15 @@ inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) {
}
/// \returns true if an instruction may have a 64-bit VGPR operand.
-bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc);
+bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
+                             const MCSubtargetInfo &ST);
/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands.
bool isDPALU_DPP32BitOpc(unsigned Opc);
/// \returns true if an instruction is a DP ALU DPP.
-bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST);
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
+ const MCSubtargetInfo &ST);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 30dab55..d87d250 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -405,7 +405,7 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
!if(!eq(vt.Size, 64),
- (ins VSrc_f64:$src0, VReg_64:$src1, ImmOpType:$imm),
+ (ins VSrc_f64:$src0, VReg_64_AlignTarget:$src1, ImmOpType:$imm),
(ins VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm)));
field dag InsVOPDX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm);
let InsVOPDX_immX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immX);
@@ -474,10 +474,10 @@ def VOP_MADMK_F64 : VOP_MADMK <f64>;
// given VT.
class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
RegisterOperand ret =
- !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
- !eq(VT.Size, 96) : RegisterOperand<VReg_96>,
- !eq(VT.Size, 64) : RegisterOperand<VReg_64>,
- !eq(VT.Size, 48) : RegisterOperand<VReg_64>,
+ !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>,
+ !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>,
+ !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>,
+ !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>,
!eq(VT.Size, 16) : !if(IsTrue16,
!if(IsFake16, RegisterOperand<VGPR_32>,
RegisterOperand<VGPR_16>),
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 4a2b54d..42ec8ba 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -97,6 +97,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
let mayRaiseFPException = 0;
+ let VOP3_OPSEL = P.HasOpSel;
}
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
@@ -119,16 +120,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let HasSrc0Mods = 0;
}
-class getInterp16Asm <bit HasSrc2, bit HasOMod> {
+class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> {
string src2 = !if(HasSrc2, ", $src2_modifiers", "");
string omod = !if(HasOMod, "$omod", "");
+ string opsel = !if(OpSel, "$op_sel", "");
string ret =
- " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod;
+ " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel;
}
class getInterp16Ins <bit HasSrc2, bit HasOMod,
- Operand Src0Mod, Operand Src2Mod> {
- dag ret = !if(HasSrc2,
+ Operand Src0Mod, Operand Src2Mod, bit OpSel> {
+ dag ret1 = !if(HasSrc2,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
InterpAttr:$attr, InterpAttrChan:$attrchan,
@@ -143,19 +145,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
InterpAttr:$attr, InterpAttrChan:$attrchan,
highmod:$high, Clamp0:$clamp, omod0:$omod)
);
+ dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins));
+ dag ret = !con(ret1, ret2);
}
-class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
let IsSingle = 1;
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
+ let HasOpSel = OpSel;
let Src0Mod = FPVRegInputMods;
let Src2Mod = FPVRegInputMods;
let Outs64 = (outs DstRC.RegClass:$vdst);
- let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
- let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
+ let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret;
+ let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret;
}
//===----------------------------------------------------------------------===//
@@ -480,7 +485,7 @@ let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
let OtherPredicates = [isNotGFX90APlus] in
-def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>;
} // End SubtargetPredicate = isGFX9Plus
// This predicate should only apply to the selection pattern. The
@@ -2676,6 +2681,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
}
+multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
@@ -2788,7 +2801,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
-defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
+defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">;
defm V_ADD_I32 : VOP3_Real_vi <0x29c>;
defm V_SUB_I32 : VOP3_Real_vi <0x29d>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5daf860..7cfd059 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
: VOP3P_Mix_Profile<P, Features, 0> {
let IsTrue16 = 1;
- let IsRealTrue16 = 1;
+ let IsRealTrue16 = 1;
let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
}
@@ -705,16 +705,16 @@ foreach Type = ["U", "I"] in
(!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
def ADst_32 : VOPDstOperand<AGPR_32>;
-def ADst_64 : VOPDstOperand<AReg_64>;
-def ADst_128 : VOPDstOperand<AReg_128>;
-def ADst_256 : VOPDstOperand<AReg_256>;
-def ADst_512 : VOPDstOperand<AReg_512>;
-def ADst_1024 : VOPDstOperand<AReg_1024>;
-def VDst_64 : VOPDstOperand<VReg_64>;
-def VDst_128 : VOPDstOperand<VReg_128>;
-def VDst_256 : VOPDstOperand<VReg_256>;
-def VDst_512 : VOPDstOperand<VReg_512>;
-def VDst_1024 : VOPDstOperand<VReg_1024>;
+def ADst_64 : VOPDstOperand<AReg_64_AlignTarget>;
+def ADst_128 : VOPDstOperand<AReg_128_AlignTarget>;
+def ADst_256 : VOPDstOperand<AReg_256_AlignTarget>;
+def ADst_512 : VOPDstOperand<AReg_512_AlignTarget>;
+def ADst_1024 : VOPDstOperand<AReg_1024_AlignTarget>;
+def VDst_64 : VOPDstOperand<VReg_64_AlignTarget>;
+def VDst_128 : VOPDstOperand<VReg_128_AlignTarget>;
+def VDst_256 : VOPDstOperand<VReg_256_AlignTarget>;
+def VDst_512 : VOPDstOperand<VReg_512_AlignTarget>;
+def VDst_1024 : VOPDstOperand<VReg_1024_AlignTarget>;
def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
let Src0RC64 = ARegSrc_32;
@@ -811,23 +811,23 @@ def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F
def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
-def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
-def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>;
-
-def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>;
-def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>;
+def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>;
+def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+
+def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
+def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>;
def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
@@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
}
// Currently assumes scaled instructions never have abid
-class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
+class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag <
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
node:$src0_modifiers, node:$scale_src0,
node:$src1_modifiers, node:$scale_src1),
@@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled =
(ops node:$blgp))),
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
- (Op $src0, $src1, $src2, $cbsz, $blgp))),
- pred
->;
-
-defvar MayNeedAGPRs = [{
- return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
-
-defvar MayNeedAGPRs_gisel = [{
- return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+ (Op $src0, $src1, $src2, $cbsz, $blgp)))>;
-defvar MayNotNeedAGPRs = [{
- return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+class CanUseAGPR_MAI<ValueType vt> {
+ code PredicateCode = [{
+ return !Subtarget->hasGFX90AInsts() ||
+ (!SIMachineFunctionInfo::MFMAVGPRForm &&
+ MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+ }] # !srl(vt.Size, 5) # ");";
-defvar MayNotNeedAGPRs_gisel = [{
- return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+ code GISelPredicateCode = [{
+ return !Subtarget->hasGFX90AInsts() ||
+ (!SIMachineFunctionInfo::MFMAVGPRForm &&
+ MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+ }] # !srl(vt.Size, 5) # ");";
+}
-class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
+class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = true,
bit Scaled = false> :
- MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> {
- let GISelPredicateCode = MayNeedAGPRs_gisel;
-}
+ MAIFrag<Op, HasAbid, Scaled>,
+ CanUseAGPR_MAI<vt>;
class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
- bit Scaled = false> :
- MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> {
- let GISelPredicateCode = MayNotNeedAGPRs_gisel;
-}
+ bit Scaled = false> :
+ MAIFrag<Op, HasAbid, Scaled>;
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
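
[Editor's note] The CanUseAGPR_MAI predicate above splices !srl(vt.Size, 5), the accumulator size in 32-bit registers, into the generated C++. For a 128-bit accumulator that value is 4, so the emitted predicate body is equivalent to the sketch below; the wrapper function signature and the backend-internal include paths are assumptions for illustration.

    // Backend-internal headers; paths are assumptions for this sketch.
    #include "GCNSubtarget.h"
    #include "SIMachineFunctionInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"

    static bool canUseAGPRsForV4F32Acc(const llvm::GCNSubtarget *Subtarget,
                                       const llvm::MachineFunction *MF) {
      // vt.Size = 128 bits, so !srl(128, 5) = 4 required 32-bit AGPRs.
      return !Subtarget->hasGFX90AInsts() ||
             (!llvm::SIMachineFunctionInfo::MFMAVGPRForm &&
              MF->getInfo<llvm::SIMachineFunctionInfo>()->getMinNumAGPRs() >= 4);
    }
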
@@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
bit HasAbid = true,
bit Scaled = false> {
defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap;
+ defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P);
+ defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD");
+
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
- def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
- !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
+ def _e64 : MAIInst<OpName, ProfileAGPR,
+ !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
MFMATable<0, "AGPR", NAME # "_e64">;
let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
- def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+ def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR,
!if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">;
}
@@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = OpName in {
- def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
- !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
+ def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR,
+ !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">;
let OtherPredicates = [isGFX90APlus] in
- def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+ def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR,
!if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
}
@@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
defvar UnscaledOpName = UnscaledOpName_#VariantSuffix;
defvar HasAbid = false;
-
- defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap;
+ defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl);
+ defvar NoDstOverlap = Profile.NoDstOverlap;
def _e64 : ScaledMAIInst<OpName,
- !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>,
+ !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, HasAbid, true>)>,
MFMATable<0, "AGPR", NAME # "_e64">;
def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd",
@@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = UnscaledOpName_ in {
def _mac_e64 : ScaledMAIInst<OpName # "_mac",
- !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>,
+ !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>,
MFMATable<1, "AGPR", NAME # "_e64">;
def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd",
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 631f0f3..8325c62 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -419,6 +419,13 @@ class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op,
let Inst{14-11} = scale_sel;
}
+class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> {
+ let Inst{11} = src0_modifiers{2};
+ // There's no src1
+ let Inst{13} = src2_modifiers{2};
+ let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 2e8a676..ce1cdb3 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -232,6 +232,7 @@ getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, ARM::SP);
markSuperRegs(Reserved, ARM::PC);
markSuperRegs(Reserved, ARM::FPSCR);
+ markSuperRegs(Reserved, ARM::FPSCR_RM);
markSuperRegs(Reserved, ARM::APSR_NZCV);
if (TFI->isFPReserved(MF))
markSuperRegs(Reserved, STI.getFramePointerReg());
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f4ac6bb..2a40fb9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::LRINT, MVT::f16, Expand);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 31650e0..6771106 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -435,14 +435,14 @@ def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
// FP Binary Operations.
//
-let TwoOperandAliasConstraint = "$Dn = $Dd" in
+let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VADDD : ADbI<0b11100, 0b11, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPALU64]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VADDS : ASbIn<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
@@ -453,21 +453,21 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let D = VFPNeonA8Domain;
}
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
[(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
-let TwoOperandAliasConstraint = "$Dn = $Dd" in
+let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPALU64]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
@@ -478,42 +478,42 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let D = VFPNeonA8Domain;
}
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
[(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
-let TwoOperandAliasConstraint = "$Dn = $Dd" in
+let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPDIV64]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVS : ASbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>,
Sched<[WriteFPDIV32]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
[(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPDIV32]>;
-let TwoOperandAliasConstraint = "$Dn = $Dd" in
+let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMULS : ASbIn<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
@@ -524,21 +524,21 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let D = VFPNeonA8Domain;
}
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
[(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
-let TwoOperandAliasConstraint = "$Dn = $Dd" in
+let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMULS : ASbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
@@ -549,7 +549,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
let D = VFPNeonA8Domain;
}
-let TwoOperandAliasConstraint = "$Sn = $Sd" in
+let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
@@ -589,7 +589,7 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
- isUnpredicable = 1 in {
+ isUnpredicable = 1, mayRaiseFPException = 1 in {
def H : AHbInp<0b11101, 0b00, opc,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
@@ -621,7 +621,7 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b),
(VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
// These are encoded as unary instructions.
-let Defs = [FPSCR_NZCV] in {
+let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", "",
@@ -684,7 +684,7 @@ def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
[(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>;
-let Defs = [FPSCR_NZCV] in {
+let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", "",
@@ -742,6 +742,7 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
}
} // Defs = [FPSCR_NZCV]
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "",
@@ -762,6 +763,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
}
// Special case encoding: bits 11-8 is 0b1011.
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "",
[(set SPR:$Sd, (fpround DPR:$Dm))]>,
@@ -787,7 +789,7 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
}
// Between half, single and double-precision.
-let hasSideEffects = 0 in
+let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", "",
[/* Intentionally left blank, see patterns below */]>,
@@ -799,7 +801,7 @@ def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))),
def : FP16Pat<(f16_to_fp GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-let hasSideEffects = 0 in
+let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda",
[/* Intentionally left blank, see patterns below */]>,
@@ -821,7 +823,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm
SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
-let hasSideEffects = 0 in
+let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", "",
[/* Intentionally left blank, see patterns below */]>,
@@ -835,7 +837,7 @@ def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
(v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
(SSubReg_f16_reg imm_odd:$lane)))>;
-let hasSideEffects = 0 in
+let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda",
[/* Intentionally left blank, see patterns below */]>,
@@ -853,6 +855,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm
SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", "",
@@ -876,6 +879,7 @@ def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
(VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>,
Requires<[HasFPARMv8, HasDPVFP]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm),
NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda",
@@ -901,6 +905,7 @@ def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
(i32 (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$a), GPR))>,
Requires<[HasFPARMv8, HasDPVFP]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", "",
@@ -915,6 +920,7 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
let hasSideEffects = 0;
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm),
NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda",
@@ -934,7 +940,8 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
- let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0,
+ mayRaiseFPException = 1 in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
@@ -1055,7 +1062,9 @@ def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
[(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>;
-multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
+multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node,
+ list<Register> uses = [], bit fpexc = 0> {
+ let Uses = uses, mayRaiseFPException = fpexc in {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
@@ -1081,6 +1090,7 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
let Inst{7} = op2;
let Inst{16} = op;
}
+ }
def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"),
(!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
@@ -1093,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
Requires<[HasFPARMv8,HasDPVFP]>;
}
-defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>;
-defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>;
-defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>;
multiclass vrint_inst_anpm<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
@@ -1140,18 +1150,21 @@ defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>;
defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "",
[(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>,
Sched<[WriteFPSQRT64]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "",
[(set SPR:$Sd, (fsqrt SPR:$Sm))]>,
Sched<[WriteFPSQRT32]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
@@ -1486,6 +1499,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let hasSideEffects = 0;
}
+let mayRaiseFPException = 1 in
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
@@ -1502,6 +1516,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
(VSITOD (VLDRS addrmode5:$a))>;
}
+let mayRaiseFPException = 1 in
def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd),(ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
@@ -1520,6 +1535,7 @@ def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
+let mayRaiseFPException = 1 in
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
@@ -1532,6 +1548,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
(VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
+let mayRaiseFPException = 1 in
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1548,6 +1565,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
(VUITOD (VLDRS addrmode5:$a))>;
}
+let mayRaiseFPException = 1 in
def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
@@ -1566,6 +1584,7 @@ def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
+let mayRaiseFPException = 1 in
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
@@ -1640,6 +1659,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
}
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+let mayRaiseFPException = 1 in
def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
@@ -1660,6 +1680,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
(VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
}
+let mayRaiseFPException = 1 in
def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
@@ -1684,6 +1705,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)),
addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+let mayRaiseFPException = 1 in
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
@@ -1698,6 +1720,7 @@ def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)),
(COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
+let mayRaiseFPException = 1 in
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1718,6 +1741,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
(VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
}
+let mayRaiseFPException = 1 in
def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
@@ -1742,6 +1766,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)),
addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+let mayRaiseFPException = 1 in
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
@@ -1757,7 +1782,7 @@ def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)),
(COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
-let Uses = [FPSCR] in {
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in {
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
@@ -1807,7 +1832,7 @@ def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
let Inst{7} = 0; // Z bit
let isUnpredicable = 1;
}
-}
+} // mayRaiseFPException = 1, Uses = [FPSCR_RM]
// v8.3-a Javascript Convert to Signed fixed-point
def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011,
@@ -1825,7 +1850,7 @@ def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011,
// S32 (U=0, sx=1) -> SL
// U32 (U=1, sx=1) -> UL
-let Constraints = "$a = $dst" in {
+let Constraints = "$a = $dst", mayRaiseFPException = 1 in {
// FP to Fixed-Point:
@@ -2026,9 +2051,10 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>,
Sched<[WriteFPCVT]>;
-} // End of 'let Constraints = "$a = $dst" in'
+} // End of 'let Constraints = "$a = $dst", mayRaiseFPException = 1 in'
// BFloat16 - Single precision, unary, predicated
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
class BF16_VCVT<string opc, bits<2> op7_6>
: VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm),
VFPUnaryFrm, NoItinerary,
@@ -2063,6 +2089,7 @@ def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>;
// FP Multiply-Accumulate Operations.
//
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLAD : ADbI<0b11100, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
@@ -2072,6 +2099,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
@@ -2085,6 +2113,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
let D = VFPNeonA8Domain;
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
@@ -2104,6 +2133,7 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
@@ -2113,6 +2143,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
@@ -2126,6 +2157,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
let D = VFPNeonA8Domain;
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
@@ -2144,6 +2176,7 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
(VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
@@ -2153,6 +2186,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
@@ -2166,6 +2200,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
let D = VFPNeonA8Domain;
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
@@ -2196,6 +2231,7 @@ def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)),
(VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
@@ -2205,6 +2241,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
@@ -2217,6 +2254,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
let D = VFPNeonA8Domain;
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
@@ -2237,6 +2275,7 @@ def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin),
//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
//
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMAD : ADbI<0b11101, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm",
@@ -2246,6 +2285,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm",
@@ -2258,6 +2298,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
// VFP pipelines.
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
@@ -2289,6 +2330,7 @@ def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
(VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMSD : ADbI<0b11101, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
@@ -2298,6 +2340,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
@@ -2310,6 +2353,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
// VFP pipelines.
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
@@ -2341,6 +2385,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
(VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
@@ -2350,6 +2395,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
@@ -2362,6 +2408,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
// VFP pipelines.
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
@@ -2400,6 +2447,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))
(VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm",
@@ -2409,6 +2457,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
@@ -2420,6 +2469,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
// VFP pipelines.
}
+let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 5a31b88..de42195 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -177,8 +177,9 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
}
// Current Program Status Register.
-// We model fpscr with two registers: FPSCR models the control bits and will be
-// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// We model fpscr with three registers. FPSCR models the control bits and will be
+// reserved. FPSCR_RM models rounding mode control bits and will be reserved.
+// FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
// models the APSR when it's accessed by some special instructions. In such cases
// it has the same encoding as PC.
def CPSR : ARMReg<0, "cpsr">;
@@ -189,6 +190,9 @@ def FPSCR : ARMReg<3, "fpscr">;
def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
let Aliases = [FPSCR];
}
+def FPSCR_RM : ARMReg<3, "fpscr_rm"> {
+ let Aliases = [FPSCR];
+}
def ITSTATE : ARMReg<4, "itstate">;
// Special Registers - only available in privileged mode.
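
[Editor's note] A hedged illustration of what the new FPSCR_RM register buys at the machine-IR level, assuming the usual CodeGen headers and the generated ARM register enum; the helper is invented for illustration and is not part of the patch.

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    // ARM::FPSCR_RM comes from the generated ARM register enum.

    static bool dependsOnRoundingMode(const llvm::MachineInstr &MI,
                                      const llvm::TargetRegisterInfo &TRI) {
      // After this patch, VADD/VSUB/VMUL/VDIV/VCVT/VFMA-style VFP instructions
      // carry an implicit use of FPSCR_RM, so this query reflects it.
      return MI.readsRegister(llvm::ARM::FPSCR_RM, &TRI);
    }
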
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index b202b202..e3c39a1 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -478,9 +479,95 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
}
}
+static Value *wrapPtrIfASNotZero(DenseMap<Value *, Value *> &Cache,
+ CallInst *CI, Value *P) {
+ if (auto *PTy = dyn_cast<PointerType>(P->getType())) {
+ if (PTy->getAddressSpace() == 0)
+ return P;
+ }
+ return aspaceWrapValue(Cache, CI->getFunction(), P);
+}
+
+static Instruction *aspaceMemSet(Intrinsic::ID ID,
+ DenseMap<Value *, Value *> &Cache,
+ CallInst *CI) {
+ auto *MI = cast<MemIntrinsic>(CI);
+ IRBuilder<> B(CI);
+
+ Value *OldDst = CI->getArgOperand(0);
+ Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst);
+ if (OldDst == NewDst)
+ return nullptr;
+
+ // memset(new_dst, val, len, align, isvolatile, md)
+ Value *Val = CI->getArgOperand(1);
+ Value *Len = CI->getArgOperand(2);
+
+ auto *MS = cast<MemSetInst>(CI);
+ MaybeAlign Align = MS->getDestAlign();
+ bool IsVolatile = MS->isVolatile();
+
+ if (ID == Intrinsic::memset)
+ return B.CreateMemSet(NewDst, Val, Len, Align, IsVolatile,
+ MI->getAAMetadata());
+ else
+ return B.CreateMemSetInline(NewDst, Align, Val, Len, IsVolatile,
+ MI->getAAMetadata());
+}
+
+static Instruction *aspaceMemCpy(Intrinsic::ID ID,
+ DenseMap<Value *, Value *> &Cache,
+ CallInst *CI) {
+ auto *MI = cast<MemIntrinsic>(CI);
+ IRBuilder<> B(CI);
+
+ Value *OldDst = CI->getArgOperand(0);
+ Value *OldSrc = CI->getArgOperand(1);
+ Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst);
+ Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc);
+ if (OldDst == NewDst && OldSrc == NewSrc)
+ return nullptr;
+
+ // memcpy(new_dst, dst_align, new_src, src_align, len, isvolatile, md)
+ Value *Len = CI->getArgOperand(2);
+
+ auto *MT = cast<MemTransferInst>(CI);
+ MaybeAlign DstAlign = MT->getDestAlign();
+ MaybeAlign SrcAlign = MT->getSourceAlign();
+ bool IsVolatile = MT->isVolatile();
+
+ return B.CreateMemTransferInst(ID, NewDst, DstAlign, NewSrc, SrcAlign, Len,
+ IsVolatile, MI->getAAMetadata());
+}
+
+static Instruction *aspaceMemMove(DenseMap<Value *, Value *> &Cache,
+ CallInst *CI) {
+ auto *MI = cast<MemIntrinsic>(CI);
+ IRBuilder<> B(CI);
+
+ Value *OldDst = CI->getArgOperand(0);
+ Value *OldSrc = CI->getArgOperand(1);
+ Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst);
+ Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc);
+ if (OldDst == NewDst && OldSrc == NewSrc)
+ return nullptr;
+
+ // memmove(new_dst, dst_align, new_src, src_align, len, isvolatile, md)
+ Value *Len = CI->getArgOperand(2);
+
+ auto *MT = cast<MemTransferInst>(CI);
+ MaybeAlign DstAlign = MT->getDestAlign();
+ MaybeAlign SrcAlign = MT->getSourceAlign();
+ bool IsVolatile = MT->isVolatile();
+
+ return B.CreateMemMove(NewDst, DstAlign, NewSrc, SrcAlign, Len, IsVolatile,
+ MI->getAAMetadata());
+}
+
// Support for BPF address spaces:
// - for each function in the module M, update pointer operand of
// each memory access instruction (load/store/cmpxchg/atomicrmw)
+// or intrinsic call insns (memset/memcpy/memmove)
// by casting it from non-zero address space to zero address space, e.g:
//
// (load (ptr addrspace (N) %p) ...)
@@ -493,21 +580,60 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
for (Function &F : M) {
DenseMap<Value *, Value *> CastsCache;
for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
unsigned PtrOpNum;
- if (auto *LD = dyn_cast<LoadInst>(&I))
+ if (auto *LD = dyn_cast<LoadInst>(&I)) {
PtrOpNum = LD->getPointerOperandIndex();
- else if (auto *ST = dyn_cast<StoreInst>(&I))
+ aspaceWrapOperand(CastsCache, &I, PtrOpNum);
+ continue;
+ }
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
PtrOpNum = ST->getPointerOperandIndex();
- else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I))
+ aspaceWrapOperand(CastsCache, &I, PtrOpNum);
+ continue;
+ }
+ if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) {
PtrOpNum = CmpXchg->getPointerOperandIndex();
- else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ aspaceWrapOperand(CastsCache, &I, PtrOpNum);
+ continue;
+ }
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) {
PtrOpNum = RMW->getPointerOperandIndex();
+ aspaceWrapOperand(CastsCache, &I, PtrOpNum);
+ continue;
+ }
+
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+
+ Function *Callee = CI->getCalledFunction();
+ if (!Callee || !Callee->isIntrinsic())
+ continue;
+
+ // Check memset/memcpy/memmove
+ Intrinsic::ID ID = Callee->getIntrinsicID();
+ bool IsSet = ID == Intrinsic::memset || ID == Intrinsic::memset_inline;
+ bool IsCpy = ID == Intrinsic::memcpy || ID == Intrinsic::memcpy_inline;
+ bool IsMove = ID == Intrinsic::memmove;
+ if (!IsSet && !IsCpy && !IsMove)
+ continue;
+
+ Instruction *New;
+ if (IsSet)
+ New = aspaceMemSet(ID, CastsCache, CI);
+ else if (IsCpy)
+ New = aspaceMemCpy(ID, CastsCache, CI);
else
+ New = aspaceMemMove(CastsCache, CI);
+
+ if (!New)
continue;
- aspaceWrapOperand(CastsCache, &I, PtrOpNum);
+ I.replaceAllUsesWith(New);
+ New->takeName(&I);
+ I.eraseFromParent();
}
}
Changed |= !CastsCache.empty();
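
The new aspaceMemSet/aspaceMemCpy/aspaceMemMove helpers only rebuild an intrinsic when one of its pointer arguments sits in a non-zero address space. A minimal standalone sketch of that rewrite for a plain memcpy, using public IRBuilder calls; wrapToAS0 and rewriteMemCpyToAS0 are illustrative names and stand in for the cache-backed aspaceWrapValue machinery used by the pass:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Cast P to the default address space if needed (sketch; no caching).
static Value *wrapToAS0(IRBuilder<> &B, Value *P) {
  auto *PTy = cast<PointerType>(P->getType());
  if (PTy->getAddressSpace() == 0)
    return P;
  return B.CreateAddrSpaceCast(P, PointerType::get(P->getContext(), 0));
}

// Rebuild a memcpy so that both pointer operands are in addrspace(0).
static void rewriteMemCpyToAS0(MemCpyInst *MC) {
  IRBuilder<> B(MC);
  Value *Dst = wrapToAS0(B, MC->getRawDest());
  Value *Src = wrapToAS0(B, MC->getRawSource());
  if (Dst == MC->getRawDest() && Src == MC->getRawSource())
    return;                                    // nothing lives in another AS
  Instruction *New = B.CreateMemCpy(Dst, MC->getDestAlign(), Src,
                                    MC->getSourceAlign(), MC->getLength(),
                                    MC->isVolatile());
  New->takeName(MC);
  MC->eraseFromParent();                       // memcpy returns void, no RAUW needed
}
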
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index 34026ed..ecfb5fe 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -439,18 +439,6 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
MF.getFunction().getContext());
- const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
-
- std::optional<unsigned> FirstMaskArgument = std::nullopt;
- // Preassign the first mask argument.
- if (Subtarget.hasVInstructions()) {
- for (const auto &ArgIdx : enumerate(Outs)) {
- MVT ArgVT = MVT::getVT(ArgIdx.value().Ty);
- if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1)
- FirstMaskArgument = ArgIdx.index();
- }
- }
-
for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
MVT VT = MVT::getVT(Outs[I].Ty);
if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo,
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 71c21e4..53633ea 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -675,6 +675,45 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC,
CC = getRISCVCCFromICmp(Pred);
}
+/// Select the RISC-V Zalasr opcode for the G_LOAD or G_STORE operation
+/// \p GenericOpc, appropriate for the GPR register bank and of memory access
+/// size \p OpSize.
+static unsigned selectZalasrLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
+ const bool IsStore = GenericOpc == TargetOpcode::G_STORE;
+ switch (OpSize) {
+ default:
+ llvm_unreachable("Unexpected memory size");
+ case 8:
+ return IsStore ? RISCV::SB_RL : RISCV::LB_AQ;
+ case 16:
+ return IsStore ? RISCV::SH_RL : RISCV::LH_AQ;
+ case 32:
+ return IsStore ? RISCV::SW_RL : RISCV::LW_AQ;
+ case 64:
+ return IsStore ? RISCV::SD_RL : RISCV::LD_AQ;
+ }
+}
+
+/// Select the RISC-V regimm opcode for the G_LOAD or G_STORE operation
+/// \p GenericOpc, appropriate for the GPR register bank and of memory access
+/// size \p OpSize. \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
+ const bool IsStore = GenericOpc == TargetOpcode::G_STORE;
+ switch (OpSize) {
+ case 8:
+ // Prefer unsigned due to no c.lb in Zcb.
+ return IsStore ? RISCV::SB : RISCV::LBU;
+ case 16:
+ return IsStore ? RISCV::SH : RISCV::LH;
+ case 32:
+ return IsStore ? RISCV::SW : RISCV::LW;
+ case 64:
+ return IsStore ? RISCV::SD : RISCV::LD;
+ }
+
+ return GenericOpc;
+}
+
bool RISCVInstructionSelector::select(MachineInstr &MI) {
MachineIRBuilder MIB(MI);
@@ -736,6 +775,62 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
MI.eraseFromParent();
return true;
}
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT: {
+ bool IsSigned = Opc != TargetOpcode::G_ZEXT;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI->getType(SrcReg);
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ if (SrcTy.isVector())
+ return false; // Should be handled by imported patterns.
+
+ assert((*RBI.getRegBank(DstReg, *MRI, TRI)).getID() ==
+ RISCV::GPRBRegBankID &&
+ "Unexpected ext regbank");
+
+ // Use addiw SrcReg, 0 (sext.w) for i32.
+ if (IsSigned && SrcSize == 32) {
+ MI.setDesc(TII.get(RISCV::ADDIW));
+ MI.addOperand(MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ }
+
+ // Use add.uw SrcReg, X0 (zext.w) for i32 with Zba.
+ if (!IsSigned && SrcSize == 32 && STI.hasStdExtZba()) {
+ MI.setDesc(TII.get(RISCV::ADD_UW));
+ MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ }
+
+ // Use sext.h/zext.h for i16 with Zbb.
+ if (SrcSize == 16 && STI.hasStdExtZbb()) {
+ MI.setDesc(TII.get(IsSigned ? RISCV::SEXT_H
+ : STI.isRV64() ? RISCV::ZEXT_H_RV64
+ : RISCV::ZEXT_H_RV32));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ }
+
+ // Use pack(w) SrcReg, X0 for i16 zext with Zbkb.
+ if (!IsSigned && SrcSize == 16 && STI.hasStdExtZbkb()) {
+ MI.setDesc(TII.get(STI.is64Bit() ? RISCV::PACKW : RISCV::PACK));
+ MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ }
+
+ // Fall back to shift pair.
+ auto ShiftLeft =
+ MIB.buildInstr(RISCV::SLLI, {&RISCV::GPRRegClass}, {SrcReg})
+ .addImm(STI.getXLen() - SrcSize);
+ constrainSelectedInstRegOperands(*ShiftLeft, TII, TRI, RBI);
+ auto ShiftRight = MIB.buildInstr(IsSigned ? RISCV::SRAI : RISCV::SRLI,
+ {DstReg}, {ShiftLeft})
+ .addImm(STI.getXLen() - SrcSize);
+ constrainSelectedInstRegOperands(*ShiftRight, TII, TRI, RBI);
+ MI.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_FCONSTANT: {
// TODO: Use constant pool for complex constants.
Register DstReg = MI.getOperand(0).getReg();
@@ -836,6 +931,59 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
return selectImplicitDef(MI, MIB);
case TargetOpcode::G_UNMERGE_VALUES:
return selectUnmergeValues(MI, MIB);
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE: {
+ GLoadStore &LdSt = cast<GLoadStore>(MI);
+ const Register ValReg = LdSt.getReg(0);
+ const Register PtrReg = LdSt.getPointerReg();
+ LLT PtrTy = MRI->getType(PtrReg);
+
+ const RegisterBank &RB = *RBI.getRegBank(ValReg, *MRI, TRI);
+ if (RB.getID() != RISCV::GPRBRegBankID)
+ return false;
+
+#ifndef NDEBUG
+ const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, *MRI, TRI);
+ // Check that the pointer register is valid.
+ assert(PtrRB.getID() == RISCV::GPRBRegBankID &&
+ "Load/Store pointer operand isn't a GPR");
+ assert(PtrTy.isPointer() && "Load/Store pointer operand isn't a pointer");
+#endif
+
+ // Can only handle AddressSpace 0.
+ if (PtrTy.getAddressSpace() != 0)
+ return false;
+
+ unsigned MemSize = LdSt.getMemSizeInBits().getValue();
+ AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
+
+ if (isStrongerThanMonotonic(Order)) {
+ MI.setDesc(TII.get(selectZalasrLoadStoreOp(Opc, MemSize)));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+ }
+
+ const unsigned NewOpc = selectRegImmLoadStoreOp(MI.getOpcode(), MemSize);
+ if (NewOpc == MI.getOpcode())
+ return false;
+
+ // Check if we can fold anything into the addressing mode.
+ auto AddrModeFns = selectAddrRegImm(MI.getOperand(1));
+ if (!AddrModeFns)
+ return false;
+
+ // Folded something. Create a new instruction and return it.
+ auto NewInst = MIB.buildInstr(NewOpc, {}, {}, MI.getFlags());
+ if (isa<GStore>(MI))
+ NewInst.addUse(ValReg);
+ else
+ NewInst.addDef(ValReg);
+ NewInst.cloneMemRefs(MI);
+ for (auto &Fn : *AddrModeFns)
+ Fn(NewInst);
+ MI.eraseFromParent();
+
+ return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI);
+ }
default:
return false;
}
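
For the shift-pair fallback used above when neither Zba, Zbb nor Zbkb applies, the selected slli + srai/srli sequence is just ordinary sign/zero extension. A self-contained check of that equivalence; XLEN is assumed to be 64 and the widths are chosen purely for illustration:

#include <cassert>
#include <cstdint>

// Emulate the slli/srai (sext) and slli/srli (zext) pair the selector emits.
static int64_t sextViaShifts(uint64_t X, unsigned SrcSize) {
  unsigned Amt = 64 - SrcSize;
  return (int64_t)(X << Amt) >> Amt;   // slli; srai (arithmetic shift right)
}

static uint64_t zextViaShifts(uint64_t X, unsigned SrcSize) {
  unsigned Amt = 64 - SrcSize;
  return (X << Amt) >> Amt;            // slli; srli (logical shift right)
}

int main() {
  // i16 -> i64
  assert(sextViaShifts(0xFFFF, 16) == -1);
  assert(zextViaShifts(0xFFFF, 16) == 0xFFFF);
  // i32 -> i64 (the selector prefers addiw / add.uw for this width)
  assert(sextViaShifts(0x80000000u, 32) == (int64_t)(int32_t)0x80000000u);
  assert(zextViaShifts(0x80000000u, 32) == 0x80000000u);
  return 0;
}
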
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index a537904..5dd4bf4 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -166,7 +166,7 @@ static unsigned getLRForRMW32(AtomicOrdering Ordering,
return RISCV::LR_W;
return RISCV::LR_W_AQ;
case AtomicOrdering::SequentiallyConsistent:
- return RISCV::LR_W_AQ_RL;
+ return RISCV::LR_W_AQRL;
}
}
@@ -210,7 +210,7 @@ static unsigned getLRForRMW64(AtomicOrdering Ordering,
return RISCV::LR_D;
return RISCV::LR_D_AQ;
case AtomicOrdering::SequentiallyConsistent:
- return RISCV::LR_D_AQ_RL;
+ return RISCV::LR_D_AQRL;
}
}
@@ -287,8 +287,8 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
break;
}
BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
+ .addReg(ScratchReg)
+ .addReg(AddrReg);
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
.addReg(ScratchReg)
.addReg(RISCV::X0)
@@ -375,8 +375,8 @@ static void doMaskedAtomicBinOpExpansion(const RISCVInstrInfo *TII,
ScratchReg);
BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
+ .addReg(ScratchReg)
+ .addReg(AddrReg);
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
.addReg(ScratchReg)
.addReg(RISCV::X0)
@@ -535,8 +535,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
// sc.w scratch1, scratch1, (addr)
// bnez scratch1, loop
BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), Scratch1Reg)
- .addReg(AddrReg)
- .addReg(Scratch1Reg);
+ .addReg(Scratch1Reg)
+ .addReg(AddrReg);
BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
.addReg(Scratch1Reg)
.addReg(RISCV::X0)
@@ -674,8 +674,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg(
// bnez scratch, loophead
BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)),
ScratchReg)
- .addReg(AddrReg)
- .addReg(NewValReg);
+ .addReg(NewValReg)
+ .addReg(AddrReg);
BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
.addReg(ScratchReg)
.addReg(RISCV::X0)
@@ -707,8 +707,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg(
MaskReg, ScratchReg);
BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)),
ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
+ .addReg(ScratchReg)
+ .addReg(AddrReg);
BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
.addReg(ScratchReg)
.addReg(RISCV::X0)
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index a02de31..40c05e8 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -265,7 +265,7 @@ def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">,
def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">;
def FeatureStdExtZalasr
- : RISCVExperimentalExtension<0, 1, "Load-Acquire and Store-Release Instructions">;
+ : RISCVExperimentalExtension<0, 9, "Load-Acquire and Store-Release Instructions">;
def HasStdExtZalasr : Predicate<"Subtarget->hasStdExtZalasr()">,
AssemblerPredicate<(all_of FeatureStdExtZalasr),
"'Zalasr' (Load-Acquire and Store-Release Instructions)">;
@@ -1421,7 +1421,7 @@ def HasVendorXMIPSCMov
: Predicate<"Subtarget->hasVendorXMIPSCMov()">,
AssemblerPredicate<(all_of FeatureVendorXMIPSCMov),
"'Xmipscmov' ('mips.ccmov' instruction)">;
-def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">;
+def UseMIPSCCMovInsn : Predicate<"Subtarget->useMIPSCCMovInsn()">;
def FeatureVendorXMIPSLSP
: RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">;
diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index 7f5d0af..eba35ef 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -100,93 +100,11 @@ def : LdPat<load, LD, PtrVT>;
def : StPat<store, SD, GPR, PtrVT>;
}
-// Load and store patterns for i16, needed because Zfh makes s16 load/store
-// legal and regbank select may not constrain registers to FP.
-def : LdPat<load, LH, i16>;
-def : StPat<store, SH, GPR, i16>;
-
-def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb.
-def : StPat<truncstorei8, SB, GPR, i16>;
-
-let Predicates = [HasAtomicLdSt] in {
- // Prefer unsigned due to no c.lb in Zcb.
- def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>;
- def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>;
-
- def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>;
- def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>;
-}
-
-let Predicates = [HasAtomicLdSt, IsRV64] in {
- // Load pattern is in RISCVInstrInfoA.td and shared with RV32.
- def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>;
-}
-
//===----------------------------------------------------------------------===//
// RV64 i32 patterns not used by SelectionDAG
//===----------------------------------------------------------------------===//
let Predicates = [IsRV64] in {
-def : LdPat<extloadi8, LBU, i32>; // Prefer unsigned due to no c.lb in Zcb.
-def : LdPat<extloadi16, LH, i32>;
-
-def : StPat<truncstorei8, SB, GPR, i32>;
-def : StPat<truncstorei16, SH, GPR, i32>;
-
-def : Pat<(sext (i32 GPR:$src)), (ADDIW GPR:$src, 0)>;
-
def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32),
(ADDIW GPR:$rs1, simm12_lo:$imm)>;
}
-
-let Predicates = [IsRV64, NoStdExtZba] in
-def : Pat<(zext (i32 GPR:$src)), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>;
-
-let Predicates = [IsRV32, NoStdExtZbb, NoStdExtZbkb] in
-def : Pat<(XLenVT (zext (i16 GPR:$src))),
- (SRLI (XLenVT (SLLI GPR:$src, 16)), 16)>;
-
-let Predicates = [IsRV64, NoStdExtZbb, NoStdExtZbkb] in {
-def : Pat<(i64 (zext (i16 GPR:$src))),
- (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>;
-def : Pat<(i32 (zext (i16 GPR:$src))),
- (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>;
-}
-
-let Predicates = [IsRV32, NoStdExtZbb] in
-def : Pat<(XLenVT (sext (i16 GPR:$src))),
- (SRAI (XLenVT (SLLI GPR:$src, 16)), 16)>;
-
-let Predicates = [IsRV64, NoStdExtZbb] in {
-def : Pat<(i64 (sext (i16 GPR:$src))),
- (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>;
-def : Pat<(i32 (sext (i16 GPR:$src))),
- (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>;
-}
-
-//===----------------------------------------------------------------------===//
-// Zb* RV64 patterns not used by SelectionDAG.
-//===----------------------------------------------------------------------===//
-
-let Predicates = [HasStdExtZba, IsRV64] in {
-def : Pat<(zext (i32 GPR:$src)), (ADD_UW GPR:$src, (XLenVT X0))>;
-}
-
-let Predicates = [HasStdExtZbb] in
-def : Pat<(i32 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>;
-let Predicates = [HasStdExtZbb, IsRV64] in
-def : Pat<(i64 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>;
-
-let Predicates = [HasStdExtZbb, IsRV32] in
-def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV32 GPR:$rs)>;
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def : Pat<(i64 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>;
-def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>;
-}
-
-let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV32] in
-def : Pat<(i32 (zext (i16 GPR:$rs))), (PACK GPR:$rs, (XLenVT X0))>;
-let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV64] in {
-def : Pat<(i64 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>;
-def : Pat<(i32 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>;
-}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dcce2d2..7123a2d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -434,7 +434,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ABS, MVT::i32, Custom);
}
- if (!Subtarget.useCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov())
+ if (!Subtarget.useMIPSCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov())
setOperationAction(ISD::SELECT, XLenVT, Custom);
if (Subtarget.hasVendorXqcia() && !Subtarget.is64Bit()) {
@@ -15721,8 +15721,7 @@ static SDValue combineAddOfBooleanXor(SDNode *N, SelectionDAG &DAG) {
return SDValue();
// Emit a negate of the setcc.
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- N0.getOperand(0));
+ return DAG.getNegative(N0.getOperand(0), DL, VT);
}
static SDValue performADDCombine(SDNode *N,
@@ -16498,43 +16497,60 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
SDValue X = N->getOperand(0);
if (Subtarget.hasShlAdd(3)) {
- for (uint64_t Divisor : {3, 5, 9}) {
- if (MulAmt % Divisor != 0)
- continue;
- uint64_t MulAmt2 = MulAmt / Divisor;
- // 3/5/9 * 2^N -> shl (shXadd X, X), N
- if (isPowerOf2_64(MulAmt2)) {
- SDLoc DL(N);
- SDValue X = N->getOperand(0);
- // Put the shift first if we can fold a zext into the
- // shift forming a slli.uw.
- if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
- X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt2), DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT),
- Shl);
- }
- // Otherwise, put rhe shl second so that it can fold with following
- // instructions (e.g. sext or add).
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- return DAG.getNode(ISD::SHL, DL, VT, Mul359,
- DAG.getConstant(Log2_64(MulAmt2), DL, VT));
- }
-
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
- if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
- SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
- Mul359);
+ int Shift;
+ if (int ShXAmount = isShifted359(MulAmt, Shift)) {
+ // 3/5/9 * 2^N -> shl (shXadd X, X), N
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ // Put the shift first if we can fold a zext into the shift forming
+ // a slli.uw.
+ if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
+ X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
+ SDValue Shl =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
+ DAG.getConstant(ShXAmount, DL, VT), Shl);
}
+ // Otherwise, put the shl second so that it can fold with following
+ // instructions (e.g. sext or add).
+ SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(ShXAmount, DL, VT), X);
+ return DAG.getNode(ISD::SHL, DL, VT, Mul359,
+ DAG.getConstant(Shift, DL, VT));
+ }
+
+ // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
+ int ShX;
+ int ShY;
+ switch (MulAmt) {
+ case 3 * 5:
+ ShY = 1;
+ ShX = 2;
+ break;
+ case 3 * 9:
+ ShY = 1;
+ ShX = 3;
+ break;
+ case 5 * 5:
+ ShX = ShY = 2;
+ break;
+ case 5 * 9:
+ ShY = 2;
+ ShX = 3;
+ break;
+ case 9 * 9:
+ ShX = ShY = 3;
+ break;
+ default:
+ ShX = ShY = 0;
+ break;
+ }
+ if (ShX) {
+ SDLoc DL(N);
+ SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(ShY, DL, VT), X);
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
+ DAG.getConstant(ShX, DL, VT), Mul359);
}
// If this is a power 2 + 2/4/8, we can use a shift followed by a single
@@ -16557,18 +16573,14 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// variants we could implement. e.g.
// (2^(1,2,3) * 3,5,9 + 1) << C2
// 2^(C1>3) * 3,5,9 +/- 1
- for (uint64_t Divisor : {3, 5, 9}) {
- uint64_t C = MulAmt - 1;
- if (C <= Divisor)
- continue;
- unsigned TZ = llvm::countr_zero(C);
- if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
+ if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
+ assert(Shift != 0 && "MulAmt=4,6,10 handled before");
+ if (Shift <= 3) {
SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+ SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(ShXAmount, DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(TZ, DL, VT), X);
+ DAG.getConstant(Shift, DL, VT), X);
}
}
@@ -16576,7 +16588,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
+ unsigned ShiftAmt = llvm::countr_zero((MulAmt - 1) & (MulAmt - 2));
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
@@ -16589,7 +16601,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
- unsigned ShAmt = Log2_64(MulAmt + Offset);
+ unsigned ShAmt = llvm::countr_zero(MulAmt + Offset);
if (ShAmt >= VT.getSizeInBits())
continue;
SDLoc DL(N);
@@ -16608,21 +16620,16 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt2 = MulAmt / Divisor;
// 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
// of 25 which happen to be quite common.
- for (uint64_t Divisor2 : {3, 5, 9}) {
- if (MulAmt2 % Divisor2 != 0)
- continue;
- uint64_t MulAmt3 = MulAmt2 / Divisor2;
- if (isPowerOf2_64(MulAmt3)) {
- SDLoc DL(N);
- SDValue Mul359A =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- SDValue Mul359B = DAG.getNode(
- RISCVISD::SHL_ADD, DL, VT, Mul359A,
- DAG.getConstant(Log2_64(Divisor2 - 1), DL, VT), Mul359A);
- return DAG.getNode(ISD::SHL, DL, VT, Mul359B,
- DAG.getConstant(Log2_64(MulAmt3), DL, VT));
- }
+ if (int ShBAmount = isShifted359(MulAmt2, Shift)) {
+ SDLoc DL(N);
+ SDValue Mul359A =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+ SDValue Mul359B =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A,
+ DAG.getConstant(ShBAmount, DL, VT), Mul359A);
+ return DAG.getNode(ISD::SHL, DL, VT, Mul359B,
+ DAG.getConstant(Shift, DL, VT));
}
}
}
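
Since shNadd computes (x << N) + y, the rewritten expandMul cases decompose a constant multiplier into at most two shXadd operations plus a shift. A quick standalone verification of three of the shapes handled above; the multiplier values 20, 45 and 11 are examples only:

#include <cassert>
#include <cstdint>

// shNadd x, y == (x << N) + y, the operation RISCVISD::SHL_ADD is selected to
// (Zba sh1add/sh2add/sh3add).
static uint64_t shadd(unsigned N, uint64_t X, uint64_t Y) { return (X << N) + Y; }

int main() {
  uint64_t X = 1234567;
  // 3/5/9 * 2^N  ->  shl (shXadd X, X), N         e.g. 20 = 5 << 2
  assert((shadd(2, X, X) << 2) == 20 * X);
  // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)   e.g. 45 = 9 * 5
  uint64_t M5 = shadd(2, X, X);                    // 5 * X
  assert(shadd(3, M5, M5) == 45 * X);              // 9 * (5 * X)
  // 3/5/9 << C + 1 -> shXadd (shYadd X, X), X     e.g. 11 = (5 << 1) + 1
  assert(shadd(1, shadd(2, X, X), X) == 11 * X);
  return 0;
}
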
@@ -16966,7 +16973,7 @@ performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc))
if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG())
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+ return DAG.getNegative(Src, DL, VT);
// Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1)
if (Opc == ISD::XOR && SrcVT == MVT::i1 &&
@@ -25031,8 +25038,17 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
// Mark RVV intrinsic as supported.
- if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID()))
+ if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) {
+ // GISel doesn't support tuple types yet.
+ if (Inst.getType()->isRISCVVectorTupleTy())
+ return true;
+
+ for (unsigned i = 0; i < II->arg_size(); ++i)
+ if (II->getArgOperand(i)->getType()->isRISCVVectorTupleTy())
+ return true;
+
return false;
+ }
}
if (Inst.getType()->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7db4832..96e1078 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4586,24 +4586,23 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(DestReg, RegState::Kill)
.addImm(ShiftAmount)
.setMIFlag(Flag);
- } else if (STI.hasShlAdd(3) &&
- ((Amount % 3 == 0 && isPowerOf2_64(Amount / 3)) ||
- (Amount % 5 == 0 && isPowerOf2_64(Amount / 5)) ||
- (Amount % 9 == 0 && isPowerOf2_64(Amount / 9)))) {
+ } else if (int ShXAmount, ShiftAmount;
+ STI.hasShlAdd(3) &&
+ (ShXAmount = isShifted359(Amount, ShiftAmount)) != 0) {
// We can use Zba SHXADD+SLLI instructions for multiply in some cases.
unsigned Opc;
- uint32_t ShiftAmount;
- if (Amount % 9 == 0) {
- Opc = RISCV::SH3ADD;
- ShiftAmount = Log2_64(Amount / 9);
- } else if (Amount % 5 == 0) {
- Opc = RISCV::SH2ADD;
- ShiftAmount = Log2_64(Amount / 5);
- } else if (Amount % 3 == 0) {
+ switch (ShXAmount) {
+ case 1:
Opc = RISCV::SH1ADD;
- ShiftAmount = Log2_64(Amount / 3);
- } else {
- llvm_unreachable("implied by if-clause");
+ break;
+ case 2:
+ Opc = RISCV::SH2ADD;
+ break;
+ case 3:
+ Opc = RISCV::SH3ADD;
+ break;
+ default:
+ llvm_unreachable("unexpected result of isShifted359");
}
if (ShiftAmount)
BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 42a0c4c..c5eddb9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -25,6 +25,25 @@
namespace llvm {
+// If Value is of the form C1<<C2, where C1 = 3, 5 or 9,
+// returns log2(C1 - 1) and assigns Shift = C2.
+// Otherwise, returns 0.
+template <typename T> int isShifted359(T Value, int &Shift) {
+ if (Value == 0)
+ return 0;
+ Shift = llvm::countr_zero(Value);
+ switch (Value >> Shift) {
+ case 3:
+ return 1;
+ case 5:
+ return 2;
+ case 9:
+ return 3;
+ default:
+ return 0;
+ }
+}
+
class RISCVSubtarget;
static const MachineMemOperand::Flags MONontemporalBit0 =
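
The contract of the new isShifted359 helper, restated with a few concrete values. This is a standalone copy for illustration (C++20 for std::countr_zero); the in-tree version is a template over the integer type:

#include <bit>
#include <cassert>
#include <cstdint>

// For Value of the form C1 << C2 with C1 in {3, 5, 9}, return log2(C1 - 1)
// and set Shift = C2; otherwise return 0.
static int isShifted359(uint64_t Value, int &Shift) {
  if (Value == 0)
    return 0;
  Shift = std::countr_zero(Value);
  switch (Value >> Shift) {
  case 3: return 1;
  case 5: return 2;
  case 9: return 3;
  default: return 0;
  }
}

int main() {
  int Shift = -1;
  assert(isShifted359(24, Shift) == 1 && Shift == 3);  // 24 = 3 << 3 -> SH1ADD + SLLI 3
  assert(isShifted359(20, Shift) == 2 && Shift == 2);  // 20 = 5 << 2 -> SH2ADD + SLLI 2
  assert(isShifted359(9, Shift) == 3 && Shift == 0);   //  9 = 9 << 0 -> SH3ADD only
  assert(isShifted359(7, Shift) == 0);                 // not of the 3/5/9 << N form
  return 0;
}
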
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 9855c47..7a14929 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1980,7 +1980,7 @@ def : LdPat<sextloadi8, LB>;
def : LdPat<extloadi8, LBU>; // Prefer unsigned due to no c.lb in Zcb.
def : LdPat<sextloadi16, LH>;
def : LdPat<extloadi16, LH>;
-def : LdPat<load, LW, i32>;
+def : LdPat<load, LW, i32>, Requires<[IsRV32]>;
def : LdPat<zextloadi8, LBU>;
def : LdPat<zextloadi16, LHU>;
@@ -1994,7 +1994,7 @@ class StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy,
def : StPat<truncstorei8, SB, GPR, XLenVT>;
def : StPat<truncstorei16, SH, GPR, XLenVT>;
-def : StPat<store, SW, GPR, i32>;
+def : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>;
/// Fences
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 25accd9..571d72f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -24,36 +24,36 @@ class LR_r<bit aq, bit rl, bits<3> funct3, string opcodestr>
}
multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> {
- def "" : LR_r<0, 0, funct3, opcodestr>;
- def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">;
- def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">;
- def _AQ_RL : LR_r<1, 1, funct3, opcodestr # ".aqrl">;
+ def "" : LR_r<0, 0, funct3, opcodestr>;
+ def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">;
+ def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">;
+ def _AQRL : LR_r<1, 1, funct3, opcodestr # ".aqrl">;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
class SC_r<bit aq, bit rl, bits<3> funct3, string opcodestr>
: RVInstRAtomic<0b00011, aq, rl, funct3, OPC_AMO,
- (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2),
+ (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1),
opcodestr, "$rd, $rs2, $rs1">;
multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> {
- def "" : SC_r<0, 0, funct3, opcodestr>;
- def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">;
- def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">;
- def _AQ_RL : SC_r<1, 1, funct3, opcodestr # ".aqrl">;
+ def "" : SC_r<0, 0, funct3, opcodestr>;
+ def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">;
+ def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">;
+ def _AQRL : SC_r<1, 1, funct3, opcodestr # ".aqrl">;
}
let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in
class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr>
: RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO,
- (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2),
+ (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1),
opcodestr, "$rd, $rs2, $rs1">;
multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> {
- def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>;
- def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">;
- def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">;
- def _AQ_RL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">;
+ def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>;
+ def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">;
+ def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">;
+ def _AQRL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">;
}
//===----------------------------------------------------------------------===//
@@ -174,8 +174,9 @@ let Predicates = [HasAtomicLdSt] in {
def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>;
def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>;
def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>;
+}
- // Used by GISel for RV32 and RV64.
+let Predicates = [HasAtomicLdSt, IsRV32] in {
def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>;
}
@@ -188,31 +189,34 @@ let Predicates = [HasAtomicLdSt, IsRV64] in {
/// AMOs
+class PatAMO<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
+ : Pat<(vt (OpNode (XLenVT GPR:$rs1), (vt GPR:$rs2))), (Inst GPR:$rs2, GPR:$rs1)>;
+
multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT,
list<Predicate> ExtraPreds = []> {
let Predicates = !listconcat([HasStdExtA, NoStdExtZtso], ExtraPreds) in {
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"),
- !cast<RVInst>(BaseInst), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"),
- !cast<RVInst>(BaseInst#"_AQ"), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"),
- !cast<RVInst>(BaseInst#"_RL"), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"),
- !cast<RVInst>(BaseInst#"_AQ_RL"), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"),
- !cast<RVInst>(BaseInst#"_AQ_RL"), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"),
+ !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"),
+ !cast<RVInst>(BaseInst#"_AQ"), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"),
+ !cast<RVInst>(BaseInst#"_RL"), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"),
+ !cast<RVInst>(BaseInst#"_AQRL"), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"),
+ !cast<RVInst>(BaseInst#"_AQRL"), vt>;
}
let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in {
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"),
- !cast<RVInst>(BaseInst), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"),
- !cast<RVInst>(BaseInst), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"),
- !cast<RVInst>(BaseInst), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"),
- !cast<RVInst>(BaseInst), vt>;
- def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"),
- !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"),
+ !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"),
+ !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"),
+ !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"),
+ !cast<RVInst>(BaseInst), vt>;
+ def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"),
+ !cast<RVInst>(BaseInst), vt>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
index 115ab38e..0b5bee1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
@@ -175,7 +175,7 @@ def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd),
Sched<[]>;
}
-let Predicates = [UseCCMovInsn] in {
+let Predicates = [UseMIPSCCMovInsn] in {
def : Pat<(select (riscv_setne (XLenVT GPR:$rs2)),
(XLenVT GPR:$rs1), (XLenVT GPR:$rs3)),
(MIPS_CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
index 7cf6d5f..20e2142 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
@@ -9,8 +9,8 @@
// This file describes the RISC-V instructions from the standard atomic 'Za*'
// extensions:
// - Zawrs (v1.0) : Wait-on-Reservation-Set.
-// - Zacas (v1.0-rc1) : Atomic Compare-and-Swap.
-// - Zabha (v1.0-rc1) : Byte and Halfword Atomic Memory Operations.
+// - Zacas (v1.0) : Atomic Compare-and-Swap.
+// - Zabha (v1.0) : Byte and Halfword Atomic Memory Operations.
//
//===----------------------------------------------------------------------===//
@@ -44,15 +44,15 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "$rd = $rd_wb"
class AMO_cas<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr,
DAGOperand RC>
: RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO,
- (outs RC:$rd_wb), (ins RC:$rd, GPRMemZeroOffset:$rs1, RC:$rs2),
+ (outs RC:$rd_wb), (ins RC:$rd, RC:$rs2, GPRMemZeroOffset:$rs1),
opcodestr, "$rd, $rs2, $rs1">;
multiclass AMO_cas_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr,
DAGOperand RC> {
- def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>;
- def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>;
- def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>;
- def _AQ_RL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>;
+ def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>;
+ def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>;
+ def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>;
+ def _AQRL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>;
}
let Predicates = [HasStdExtZacas], IsSignExtendingOpW = 1 in {
@@ -71,48 +71,48 @@ defm AMOCAS_Q : AMO_cas_aq_rl<0b00101, 0b100, "amocas.q", GPRPairRV64>;
multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT,
list<Predicate> ExtraPreds = []> {
let Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) in {
- def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr),
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>;
+ (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>;
def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>;
+ (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>;
} // Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds)
let Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) in {
- def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr),
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
- def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr),
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+ def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (XLenVT GPR:$addr),
(vt GPR:$cmp),
(vt GPR:$new)),
- (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
+ (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
} // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds)
}
@@ -140,7 +140,7 @@ def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>;
// Zabha (Byte and Halfword Atomic Memory Operations)
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZabha] in {
+let Predicates = [HasStdExtZabha], IsSignExtendingOpW = 1 in {
defm AMOSWAP_B : AMO_rr_aq_rl<0b00001, 0b000, "amoswap.b">,
Sched<[WriteAtomicB, ReadAtomicBA, ReadAtomicBD]>;
defm AMOADD_B : AMO_rr_aq_rl<0b00000, 0b000, "amoadd.b">,
@@ -181,7 +181,7 @@ defm AMOMAXU_H : AMO_rr_aq_rl<0b11100, 0b001, "amomaxu.h">,
}
// If Zacas extension is also implemented, Zabha further provides AMOCAS.[B|H].
-let Predicates = [HasStdExtZabha, HasStdExtZacas] in {
+let Predicates = [HasStdExtZabha, HasStdExtZacas], IsSignExtendingOpW = 1 in {
defm AMOCAS_B : AMO_cas_aq_rl<0b00101, 0b000, "amocas.b", GPR>;
defm AMOCAS_H : AMO_cas_aq_rl<0b00101, 0b001, "amocas.h", GPR>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
index 1dd7332..5f944034 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
@@ -30,21 +30,22 @@ class SRL_r<bit aq, bit rl, bits<3> funct3, string opcodestr>
opcodestr, "$rs2, $rs1"> {
let rd = 0;
}
+
multiclass LAQ_r_aq_rl<bits<3> funct3, string opcodestr> {
- def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">;
- def _AQ_RL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">;
+ def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">;
+ def _AQRL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">;
}
multiclass SRL_r_aq_rl<bits<3> funct3, string opcodestr> {
- def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">;
- def _AQ_RL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">;
+ def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">;
+ def _AQRL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">;
}
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZalasr] in {
+let Predicates = [HasStdExtZalasr], IsSignExtendingOpW = 1 in {
defm LB : LAQ_r_aq_rl<0b000, "lb">;
defm LH : LAQ_r_aq_rl<0b001, "lh">;
defm LW : LAQ_r_aq_rl<0b010, "lw">;
@@ -93,11 +94,11 @@ let Predicates = [HasStdExtZalasr] in {
def : PatSRL<releasing_store<atomic_store_32>, SW_RL>;
def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>;
-} // Predicates = [HasStdExtZalasr]
+}
let Predicates = [HasStdExtZalasr, IsRV32] in {
- def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ>;
- def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ>;
+ def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>;
+ def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>;
} // Predicates = [HasStdExtZalasr, IsRV32]
let Predicates = [HasStdExtZalasr, IsRV64] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index ce21d83..8d9b777 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -808,9 +808,9 @@ multiclass Sh2Add_UWPat<Instruction sh2add_uw> {
}
multiclass Sh3Add_UWPat<Instruction sh3add_uw> {
- def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0xFFFFFFF8),
+ def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF),
(XLenVT GPR:$rs2))),
- (sh3add_uw (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>;
+ (sh3add_uw GPR:$rs1, GPR:$rs2)>;
// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift.
def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8),
(XLenVT GPR:$rs2))),
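
The rewritten Sh3Add_UWPat above matches ((x << 3) & 0x7FFFFFFFF) + base, which is exactly the zero-extend-then-shift-by-3 that sh3add.uw applies to rs1 before adding rs2: the 0x7FFFFFFFF mask keeps the low 35 bits, i.e. 32 data bits shifted left by 3. A quick identity check with arbitrary values:

#include <cassert>
#include <cstdint>

// sh3add.uw rd, rs1, rs2 computes (zext32(rs1) << 3) + rs2.
static uint64_t sh3add_uw(uint64_t Rs1, uint64_t Rs2) {
  return ((Rs1 & 0xFFFFFFFFull) << 3) + Rs2;
}

int main() {
  uint64_t X = 0x1234567890ABCDEFull, Base = 0x4000;
  // The DAG shape the pattern matches: add (and (shl x, 3), 0x7FFFFFFFF), base.
  uint64_t Matched = ((X << 3) & 0x7FFFFFFFFull) + Base;
  assert(Matched == sh3add_uw(X, Base));
  return 0;
}
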
diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp
index c81a20b..115a96e 100644
--- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp
@@ -92,7 +92,7 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
const RISCVSubtarget &Subtarget = Fn.getSubtarget<RISCVSubtarget>();
- if (!Subtarget.useLoadStorePairs())
+ if (!Subtarget.useMIPSLoadStorePairs())
return false;
bool MadeChange = false;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 3f2e7db..3e07eff 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -567,9 +567,12 @@ multiclass SiFive7WriteResBase<int VLEN,
defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, VLEN>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
- defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, [VCQ, VL],
- 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
- [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+ defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred,
+ // Predicated
+ [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)],
+ // Not Predicated
+ [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)],
+ mx, IsWorstCase>;
let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm : LMULWriteResMX<"WriteVLDUX8", [VCQ, VL], mx, IsWorstCase>;
defm : LMULWriteResMX<"WriteVLDOX8", [VCQ, VL], mx, IsWorstCase>;
@@ -587,9 +590,12 @@ multiclass SiFive7WriteResBase<int VLEN,
defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, VLEN>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
- defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, [VCQ, VL],
- 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
- [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+ defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred,
+ // Predicated
+ [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)],
+ // Not Predicated
+ [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)],
+ mx, IsWorstCase>;
let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm : LMULWriteResMX<"WriteVLDUX16", [VCQ, VL], mx, IsWorstCase>;
defm : LMULWriteResMX<"WriteVLDOX16", [VCQ, VL], mx, IsWorstCase>;
@@ -604,9 +610,12 @@ multiclass SiFive7WriteResBase<int VLEN,
defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, VLEN>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
- defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, [VCQ, VL],
- 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
- [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+ defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred,
+ // Predicated
+ [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)],
+ // Not Predicated
+ [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)],
+ mx, IsWorstCase>;
let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm : LMULWriteResMX<"WriteVLDUX32", [VCQ, VL], mx, IsWorstCase>;
defm : LMULWriteResMX<"WriteVLDOX32", [VCQ, VL], mx, IsWorstCase>;
@@ -621,9 +630,12 @@ multiclass SiFive7WriteResBase<int VLEN,
defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, VLEN>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
- defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, [VCQ, VL],
- 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
- [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+ defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred,
+ // Predicated
+ [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)],
+ // Not Predicated
+ [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)],
+ mx, IsWorstCase>;
let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm : LMULWriteResMX<"WriteVLDUX64", [VCQ, VL], mx, IsWorstCase>;
defm : LMULWriteResMX<"WriteVLDOX64", [VCQ, VL], mx, IsWorstCase>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 6c7658c..01a4308 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -67,42 +67,41 @@ multiclass LMULSEWWriteResMXSEW<string name, list<ProcResourceKind> resources,
// ReleaseAtCycles predCycles if the SchedPredicate Pred is true, otherwise has
// Latency noPredLat and ReleaseAtCycles noPredCycles. The WorstCase SchedWrite
// is created similarly if IsWorstCase is true.
-multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
- list<ProcResourceKind> resources,
- int predLat, list<int> predAcquireCycles,
- list<int> predReleaseCycles, int noPredLat,
- list<int> noPredAcquireCycles,
- list<int> noPredReleaseCycles,
- string mx, bit IsWorstCase> {
- defvar nameMX = name # "_" # mx;
-
+multiclass LMULWriteResVariantImpl<string name, string writeResName, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ bit IsWorstCase> {
// Define the different behaviors
- def nameMX # "_Pred" : SchedWriteRes<resources>{
+ def writeResName # "_Pred" : SchedWriteRes<predResources>{
let Latency = predLat;
let AcquireAtCycles = predAcquireCycles;
let ReleaseAtCycles = predReleaseCycles;
}
- def nameMX # "_NoPred" : SchedWriteRes<resources> {
+ def writeResName # "_NoPred" : SchedWriteRes<noPredResources> {
let Latency = noPredLat;
let AcquireAtCycles = noPredAcquireCycles;
let ReleaseAtCycles = noPredReleaseCycles;
}
// Define SchedVars
- def nameMX # PredSchedVar
- : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>;
- def nameMX # NoPredSchedVar
- : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>;
+ def writeResName # PredSchedVar
+ : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # writeResName # "_Pred")]>;
+ def writeResName # NoPredSchedVar
+ : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # writeResName #"_NoPred")]>;
// Allow multiclass to refer to SchedVars -- need to have NAME prefix.
- defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar);
- defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar);
+ defvar PredSchedVar = !cast<SchedVar>(NAME # writeResName # PredSchedVar);
+ defvar NoPredSchedVar = !cast<SchedVar>(NAME # writeResName # NoPredSchedVar);
// Tie behavior to predicate
- def NAME # nameMX # "_Variant"
+ def NAME # writeResName # "_Variant"
: SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
def : SchedAlias<
- !cast<SchedReadWrite>(nameMX),
- !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>;
+ !cast<SchedReadWrite>(writeResName),
+ !cast<SchedReadWrite>(NAME # writeResName # "_Variant")>;
if IsWorstCase then {
def NAME # name # "_WorstCase_Variant"
@@ -113,6 +112,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
}
}
+multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ string mx, bit IsWorstCase> {
+ defm "" : LMULWriteResVariantImpl<name, name # "_" # mx, Pred, predResources,
+ predLat, predAcquireCycles,
+ predReleaseCycles, noPredResources,
+ noPredLat, noPredAcquireCycles,
+ noPredReleaseCycles,
+ IsWorstCase>;
+}
+
// Define multiclasses to define SchedWrite, SchedRead, WriteRes, and
// ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the
// SchedMxList variants above. Each multiclass is responsible for defining
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index e35ffaf..715ac4c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -65,9 +65,9 @@ static cl::opt<bool> UseMIPSLoadStorePairsOpt(
cl::desc("Enable the load/store pair optimization pass"), cl::init(false),
cl::Hidden);
-static cl::opt<bool> UseCCMovInsn("use-riscv-ccmov",
- cl::desc("Use 'mips.ccmov' instruction"),
- cl::init(true), cl::Hidden);
+static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov",
+ cl::desc("Use 'mips.ccmov' instruction"),
+ cl::init(true), cl::Hidden);
void RISCVSubtarget::anchor() {}
@@ -246,10 +246,10 @@ void RISCVSubtarget::overridePostRASchedPolicy(
}
}
-bool RISCVSubtarget::useLoadStorePairs() const {
+bool RISCVSubtarget::useMIPSLoadStorePairs() const {
return UseMIPSLoadStorePairsOpt && HasVendorXMIPSLSP;
}
-bool RISCVSubtarget::useCCMovInsn() const {
- return UseCCMovInsn && HasVendorXMIPSCMov;
+bool RISCVSubtarget::useMIPSCCMovInsn() const {
+ return UseMIPSCCMovInsn && HasVendorXMIPSCMov;
}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 7dffa63..6acf799 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -227,8 +227,8 @@ public:
unsigned getXLen() const {
return is64Bit() ? 64 : 32;
}
- bool useLoadStorePairs() const;
- bool useCCMovInsn() const;
+ bool useMIPSLoadStorePairs() const;
+ bool useMIPSCCMovInsn() const;
unsigned getFLen() const {
if (HasStdExtD)
return 64;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ee25f69..7bc0b5b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2747,20 +2747,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
Intrinsic::ID IID = Inst->getIntrinsicID();
LLVMContext &C = Inst->getContext();
bool HasMask = false;
+
+ auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
+ bool IsWrite) -> int64_t {
+ if (auto *TarExtTy =
+ dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
+ return TarExtTy->getIntParameter(0);
+
+ return 1;
+ };
+
switch (IID) {
case Intrinsic::riscv_vle_mask:
case Intrinsic::riscv_vse_mask:
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask:
+ case Intrinsic::riscv_vsseg2_mask:
+ case Intrinsic::riscv_vsseg3_mask:
+ case Intrinsic::riscv_vsseg4_mask:
+ case Intrinsic::riscv_vsseg5_mask:
+ case Intrinsic::riscv_vsseg6_mask:
+ case Intrinsic::riscv_vsseg7_mask:
+ case Intrinsic::riscv_vsseg8_mask:
HasMask = true;
[[fallthrough]];
case Intrinsic::riscv_vle:
- case Intrinsic::riscv_vse: {
+ case Intrinsic::riscv_vse:
+ case Intrinsic::riscv_vlseg2:
+ case Intrinsic::riscv_vlseg3:
+ case Intrinsic::riscv_vlseg4:
+ case Intrinsic::riscv_vlseg5:
+ case Intrinsic::riscv_vlseg6:
+ case Intrinsic::riscv_vlseg7:
+ case Intrinsic::riscv_vlseg8:
+ case Intrinsic::riscv_vsseg2:
+ case Intrinsic::riscv_vsseg3:
+ case Intrinsic::riscv_vsseg4:
+ case Intrinsic::riscv_vsseg5:
+ case Intrinsic::riscv_vsseg6:
+ case Intrinsic::riscv_vsseg7:
+ case Intrinsic::riscv_vsseg8: {
// Intrinsic interface:
// riscv_vle(merge, ptr, vl)
// riscv_vle_mask(merge, ptr, mask, vl, policy)
// riscv_vse(val, ptr, vl)
// riscv_vse_mask(val, ptr, mask, vl, policy)
+ // riscv_vlseg#(merge, ptr, vl, sew)
+ // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
+ // riscv_vsseg#(val, ptr, vl, sew)
+ // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
bool IsWrite = Inst->getType()->isVoidTy();
Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+ // The results of segment loads are TargetExtType.
+ if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+ unsigned SEW =
+ 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+ ->getZExtValue();
+ Ty = TarExtTy->getTypeParameter(0U);
+ Ty = ScalableVectorType::get(
+ IntegerType::get(C, SEW),
+ cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+ }
const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
unsigned VLIndex = RVVIInfo->VLOperand;
unsigned PtrOperandNo = VLIndex - 1 - HasMask;
@@ -2771,23 +2823,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
if (HasMask)
Mask = Inst->getArgOperand(VLIndex - 1);
Value *EVL = Inst->getArgOperand(VLIndex);
+ unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+ // RVV uses contiguous elements as a segment.
+ if (SegNum > 1) {
+ unsigned ElemSize = Ty->getScalarSizeInBits();
+ auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+ Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+ }
Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
Alignment, Mask, EVL);
return true;
}
case Intrinsic::riscv_vlse_mask:
case Intrinsic::riscv_vsse_mask:
+ case Intrinsic::riscv_vlsseg2_mask:
+ case Intrinsic::riscv_vlsseg3_mask:
+ case Intrinsic::riscv_vlsseg4_mask:
+ case Intrinsic::riscv_vlsseg5_mask:
+ case Intrinsic::riscv_vlsseg6_mask:
+ case Intrinsic::riscv_vlsseg7_mask:
+ case Intrinsic::riscv_vlsseg8_mask:
+ case Intrinsic::riscv_vssseg2_mask:
+ case Intrinsic::riscv_vssseg3_mask:
+ case Intrinsic::riscv_vssseg4_mask:
+ case Intrinsic::riscv_vssseg5_mask:
+ case Intrinsic::riscv_vssseg6_mask:
+ case Intrinsic::riscv_vssseg7_mask:
+ case Intrinsic::riscv_vssseg8_mask:
HasMask = true;
[[fallthrough]];
case Intrinsic::riscv_vlse:
- case Intrinsic::riscv_vsse: {
+ case Intrinsic::riscv_vsse:
+ case Intrinsic::riscv_vlsseg2:
+ case Intrinsic::riscv_vlsseg3:
+ case Intrinsic::riscv_vlsseg4:
+ case Intrinsic::riscv_vlsseg5:
+ case Intrinsic::riscv_vlsseg6:
+ case Intrinsic::riscv_vlsseg7:
+ case Intrinsic::riscv_vlsseg8:
+ case Intrinsic::riscv_vssseg2:
+ case Intrinsic::riscv_vssseg3:
+ case Intrinsic::riscv_vssseg4:
+ case Intrinsic::riscv_vssseg5:
+ case Intrinsic::riscv_vssseg6:
+ case Intrinsic::riscv_vssseg7:
+ case Intrinsic::riscv_vssseg8: {
// Intrinsic interface:
// riscv_vlse(merge, ptr, stride, vl)
// riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
// riscv_vsse(val, ptr, stride, vl)
// riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
+ // riscv_vlsseg#(merge, ptr, offset, vl, sew)
+ // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
+ // riscv_vssseg#(val, ptr, offset, vl, sew)
+ // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
bool IsWrite = Inst->getType()->isVoidTy();
Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+ // Segment accesses operate on riscv.vector.tuple values (a TargetExtType);
+ // rebuild the equivalent element vector type from SEW and the tuple storage.
+ if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+ unsigned SEW =
+ 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+ ->getZExtValue();
+ Ty = TarExtTy->getTypeParameter(0U);
+ Ty = ScalableVectorType::get(
+ IntegerType::get(C, SEW),
+ cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+ }
const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
unsigned VLIndex = RVVIInfo->VLOperand;
unsigned PtrOperandNo = VLIndex - 2 - HasMask;
@@ -2809,6 +2910,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
if (HasMask)
Mask = Inst->getArgOperand(VLIndex - 1);
Value *EVL = Inst->getArgOperand(VLIndex);
+ unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+ // A segment is SegNum contiguous elements; model it as a vector of wider
+ // elements so each segment is checked as one access.
+ if (SegNum > 1) {
+ unsigned ElemSize = Ty->getScalarSizeInBits();
+ auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+ Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+ }
Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
Alignment, Mask, EVL, Stride);
return true;
@@ -2817,19 +2925,89 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::riscv_vluxei_mask:
case Intrinsic::riscv_vsoxei_mask:
case Intrinsic::riscv_vsuxei_mask:
+ case Intrinsic::riscv_vloxseg2_mask:
+ case Intrinsic::riscv_vloxseg3_mask:
+ case Intrinsic::riscv_vloxseg4_mask:
+ case Intrinsic::riscv_vloxseg5_mask:
+ case Intrinsic::riscv_vloxseg6_mask:
+ case Intrinsic::riscv_vloxseg7_mask:
+ case Intrinsic::riscv_vloxseg8_mask:
+ case Intrinsic::riscv_vluxseg2_mask:
+ case Intrinsic::riscv_vluxseg3_mask:
+ case Intrinsic::riscv_vluxseg4_mask:
+ case Intrinsic::riscv_vluxseg5_mask:
+ case Intrinsic::riscv_vluxseg6_mask:
+ case Intrinsic::riscv_vluxseg7_mask:
+ case Intrinsic::riscv_vluxseg8_mask:
+ case Intrinsic::riscv_vsoxseg2_mask:
+ case Intrinsic::riscv_vsoxseg3_mask:
+ case Intrinsic::riscv_vsoxseg4_mask:
+ case Intrinsic::riscv_vsoxseg5_mask:
+ case Intrinsic::riscv_vsoxseg6_mask:
+ case Intrinsic::riscv_vsoxseg7_mask:
+ case Intrinsic::riscv_vsoxseg8_mask:
+ case Intrinsic::riscv_vsuxseg2_mask:
+ case Intrinsic::riscv_vsuxseg3_mask:
+ case Intrinsic::riscv_vsuxseg4_mask:
+ case Intrinsic::riscv_vsuxseg5_mask:
+ case Intrinsic::riscv_vsuxseg6_mask:
+ case Intrinsic::riscv_vsuxseg7_mask:
+ case Intrinsic::riscv_vsuxseg8_mask:
HasMask = true;
[[fallthrough]];
case Intrinsic::riscv_vloxei:
case Intrinsic::riscv_vluxei:
case Intrinsic::riscv_vsoxei:
- case Intrinsic::riscv_vsuxei: {
+ case Intrinsic::riscv_vsuxei:
+ case Intrinsic::riscv_vloxseg2:
+ case Intrinsic::riscv_vloxseg3:
+ case Intrinsic::riscv_vloxseg4:
+ case Intrinsic::riscv_vloxseg5:
+ case Intrinsic::riscv_vloxseg6:
+ case Intrinsic::riscv_vloxseg7:
+ case Intrinsic::riscv_vloxseg8:
+ case Intrinsic::riscv_vluxseg2:
+ case Intrinsic::riscv_vluxseg3:
+ case Intrinsic::riscv_vluxseg4:
+ case Intrinsic::riscv_vluxseg5:
+ case Intrinsic::riscv_vluxseg6:
+ case Intrinsic::riscv_vluxseg7:
+ case Intrinsic::riscv_vluxseg8:
+ case Intrinsic::riscv_vsoxseg2:
+ case Intrinsic::riscv_vsoxseg3:
+ case Intrinsic::riscv_vsoxseg4:
+ case Intrinsic::riscv_vsoxseg5:
+ case Intrinsic::riscv_vsoxseg6:
+ case Intrinsic::riscv_vsoxseg7:
+ case Intrinsic::riscv_vsoxseg8:
+ case Intrinsic::riscv_vsuxseg2:
+ case Intrinsic::riscv_vsuxseg3:
+ case Intrinsic::riscv_vsuxseg4:
+ case Intrinsic::riscv_vsuxseg5:
+ case Intrinsic::riscv_vsuxseg6:
+ case Intrinsic::riscv_vsuxseg7:
+ case Intrinsic::riscv_vsuxseg8: {
// Intrinsic interface (only the ordered versions are listed):
// riscv_vloxei(merge, ptr, index, vl)
// riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
// riscv_vsoxei(val, ptr, index, vl)
// riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
+ // riscv_vloxseg#(merge, ptr, index, vl, sew)
+ // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
+ // riscv_vsoxseg#(val, ptr, index, vl, sew)
+ // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
bool IsWrite = Inst->getType()->isVoidTy();
Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+ // Segment accesses operate on riscv.vector.tuple values (a TargetExtType);
+ // rebuild the equivalent element vector type from SEW and the tuple storage.
+ if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+ unsigned SEW =
+ 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+ ->getZExtValue();
+ Ty = TarExtTy->getTypeParameter(0U);
+ Ty = ScalableVectorType::get(
+ IntegerType::get(C, SEW),
+ cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+ }
const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
unsigned VLIndex = RVVIInfo->VLOperand;
unsigned PtrOperandNo = VLIndex - 2 - HasMask;
@@ -2845,6 +3023,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
Mask = ConstantInt::getTrue(MaskType);
}
Value *EVL = Inst->getArgOperand(VLIndex);
+ unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+ // A segment is SegNum contiguous elements; model it as a vector of wider
+ // elements so each segment is checked as one access.
+ if (SegNum > 1) {
+ unsigned ElemSize = Ty->getScalarSizeInBits();
+ auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+ Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+ }
Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
Align(1), Mask, EVL,
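
To make the operand-index arithmetic above concrete, here is an illustrative-only instantiation for riscv_vloxseg4_mask(merge, ptr, index, mask, vl, policy, sew), whose vl operand is at index 4; the constant names are assumptions for the example:

    // Illustrative only: matches the layout documented in the comment above.
    constexpr unsigned VLIndex = 4;                          // vl
    constexpr bool HasMask = true;
    constexpr unsigned PtrOperandNo = VLIndex - 2 - HasMask; // 1 -> ptr
    constexpr unsigned MaskOperandNo = VLIndex - 1;          // 3 -> mask
    constexpr unsigned OffsetOperandNo = PtrOperandNo + 1;   // 2 -> index
    static_assert(PtrOperandNo == 1 && MaskOperandNo == 3 && OffsetOperandNo == 2);
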
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 8e7e2e5..f1d487c87 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -326,6 +326,15 @@ void SparcAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) {
Sparc_MC::verifyInstructionPredicates(MI->getOpcode(),
getSubtargetInfo().getFeatureBits());
+ // A BUNDLE is only a wrapper: emit each bundled instruction individually.
+ if (MI->isBundle()) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ emitInstruction(&*I);
+ ++I;
+ }
+ return;
+ }
switch (MI->getOpcode()) {
default: break;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index a160709..cbb7db6 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -3557,3 +3558,28 @@ void SparcTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (!Node->hasAnyUseOfValue(0))
MI.getOperand(0).setReg(SP::G0);
}
+
+Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ bool HasStoreSemantics =
+ isa<AtomicCmpXchgInst, AtomicRMWInst, StoreInst>(Inst);
+ if (HasStoreSemantics && isReleaseOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Release);
+ return nullptr;
+}
+
+Instruction *SparcTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ // V8 loads already come with an implicit acquire barrier, so an explicit
+ // acquire fence is only needed on V9.
+ bool HasLoadSemantics = isa<AtomicCmpXchgInst, AtomicRMWInst, LoadInst>(Inst);
+ if (Subtarget->isV9() && HasLoadSemantics && isAcquireOrStronger(Ord))
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+
+ // Sequentially consistent plain stores need a trailing full barrier.
+ if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
+ return Builder.CreateFence(Ord);
+ return nullptr;
+}
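
A minimal standalone sketch of the fence-placement table these two hooks encode; the enum and function names are assumptions, not LLVM API. When shouldInsertFencesForAtomic() returns true, AtomicExpand brackets each atomic operation with these leading and trailing fences:

    #include <cstdio>

    // Only the four fence-relevant orderings are modelled here.
    enum class Ord { Acquire, Release, AcqRel, SeqCst };

    // Leading fence: store-like ops (store, cmpxchg, atomicrmw) that are
    // release-or-stronger get a preceding release fence.
    static const char *leading(bool IsStoreLike, Ord O) {
      bool RelOrStronger = O != Ord::Acquire;
      return (IsStoreLike && RelOrStronger) ? "fence release" : "(none)";
    }

    // Trailing fence: on V9, load-like ops that are acquire-or-stronger get a
    // following acquire fence; seq_cst plain stores get a full trailing fence.
    static const char *trailing(bool IsV9, bool IsLoadLike, bool IsPlainStore,
                                Ord O) {
      bool AcqOrStronger = O != Ord::Release;
      if (IsV9 && IsLoadLike && AcqOrStronger)
        return "fence acquire";
      if (IsPlainStore && O == Ord::SeqCst)
        return "fence seq_cst";
      return "(none)";
    }

    int main() {
      std::printf("V8 seq_cst store: %s / st / %s\n",
                  leading(true, Ord::SeqCst),
                  trailing(false, false, true, Ord::SeqCst));
      std::printf("V9 acquire load : %s / ld / %s\n",
                  leading(false, Ord::Acquire),
                  trailing(true, true, false, Ord::Acquire));
    }
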
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index e7040f7..f3efd94 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -183,6 +183,11 @@ namespace llvm {
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
// FIXME: We insert fences for each atomics and generate
// sub-optimal code for PSO/TSO. (Approximately nobody uses any
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index e28f445..f66eb9d 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -653,6 +653,23 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addImm(Offset);
return true;
}
+ case SP::V8BAR: {
+ assert(!Subtarget.isV9() &&
+ "V8BAR should not be emitted on V9 processors!");
+
+ // Emit stbar; ldstub [%sp-1], %g0
+ // The sequence acts as a full barrier on V8 systems.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr &InstSTBAR =
+ *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::STBAR));
+ MachineInstr &InstLDSTUB =
+ *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::LDSTUBri), SP::G0)
+ .addReg(SP::O6)
+ .addImm(-1);
+ MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
+ MBB.erase(MI);
+ return true;
+ }
}
return false;
}
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 97e7fd7..bc192c2 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -578,6 +578,9 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
let isPseudo = 1;
}
+// Full memory barrier for V8.
+def V8BAR : Pseudo<(outs), (ins), "!V8BAR", []>, Requires<[HasNoV9]>;
+
// GETPCX for PIC
let Defs = [O7] in {
def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
@@ -1974,12 +1977,30 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
-// store bar for all atomic_fence in V8.
-let Predicates = [HasNoV9] in
- def : Pat<(atomic_fence timm, timm), (STBAR)>;
+// All load-type operations in V8 come with implicit acquire semantics.
+let Predicates = [HasNoV9] in {
+ // Acquire -> nop
+ def : Pat<(atomic_fence (i32 4), timm), (NOP)>;
+ // Release / AcqRel -> stbar
+ def : Pat<(atomic_fence (i32 5), timm), (STBAR)>;
+ // AcqRel and stronger -> stbar; ldstub [%sp-1], %g0
+ def : Pat<(atomic_fence timm, timm), (V8BAR)>;
+}
-let Predicates = [HasV9] in
+// The fence ordering immediate may be i32 or i64, so handle both widths.
+let Predicates = [HasV9] in {
+ // Acquire -> membar #LoadLoad | #LoadStore
+ def : Pat<(atomic_fence (i32 4), timm), (MEMBARi 0x5)>;
+ def : Pat<(atomic_fence (i64 4), timm), (MEMBARi 0x5)>;
+ // Release -> membar #LoadStore | #StoreStore
+ def : Pat<(atomic_fence (i32 5), timm), (MEMBARi 0xc)>;
+ def : Pat<(atomic_fence (i64 5), timm), (MEMBARi 0xc)>;
+ // AcqRel -> membar #LoadLoad | #LoadStore | #StoreStore
+ def : Pat<(atomic_fence (i32 6), timm), (MEMBARi 0xd)>;
+ def : Pat<(atomic_fence (i64 6), timm), (MEMBARi 0xd)>;
+ // SeqCst -> membar #StoreLoad | #LoadLoad | #LoadStore | #StoreStore
def : Pat<(atomic_fence timm, timm), (MEMBARi 0xf)>;
+}
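
The immediates in the patterns above decode as follows: the atomic_fence operand is the llvm::AtomicOrdering value (4 = Acquire, 5 = Release, 6 = AcquireRelease, 7 = SequentiallyConsistent), and the MEMBARi operand is the SPARC V9 mmask field. A small illustrative-only check (the enum name is an assumption):

    // Illustrative only: SPARC V9 membar mmask bits used by the MEMBARi immediates.
    enum MembarBit : unsigned {
      LoadLoad = 0x1,
      StoreLoad = 0x2,
      LoadStore = 0x4,
      StoreStore = 0x8,
    };
    static_assert((LoadLoad | LoadStore) == 0x5);                          // Acquire
    static_assert((LoadStore | StoreStore) == 0xc);                        // Release
    static_assert((LoadLoad | LoadStore | StoreStore) == 0xd);             // AcqRel
    static_assert((LoadLoad | StoreLoad | LoadStore | StoreStore) == 0xf); // SeqCst
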
// atomic_load addr -> load addr
def : Pat<(i32 (atomic_load_azext_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 163bf9b..6472334 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3209,7 +3209,8 @@ static SDValue performAnyAllCombine(SDNode *N, SelectionDAG &DAG) {
using namespace llvm::SDPatternMatch;
SDValue LHS;
- if (!sd_match(N->getOperand(1),
+ if (N->getNumOperands() < 2 ||
+ !sd_match(N->getOperand(1),
m_c_SetCC(m_Value(LHS), m_Zero(), m_CondCode())))
return SDValue();
EVT LT = LHS.getValueType();