Diffstat (limited to 'llvm/lib/Target')
68 files changed, 1836 insertions, 902 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 076a623..639ddcb 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -69,7 +69,6 @@ def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, - fconstant_to_constant, icmp_redundant_trunc, fold_global_offset, shuffle_to_extract, @@ -341,7 +340,7 @@ def AArch64PostLegalizerLowering : GICombiner<"AArch64PostLegalizerLoweringImpl", [shuffle_vector_lowering, vashr_vlshr_imm, icmp_lowering, build_vector_lowering, - lower_vector_fcmp, form_truncstore, + lower_vector_fcmp, form_truncstore, fconstant_to_constant, vector_sext_inreg_to_shift, unmerge_ext_to_unmerge, lower_mulv2s64, vector_unmerge_lowering, insertelt_nonconst, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index c197550e..9e2d698 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({s32, s64, s128}) - .legalFor(HasFP16, {s16}) + // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT + .legalFor({s16, s32, s64, s128}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 63313da..23dcaea 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -75,6 +75,31 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo() = default; }; +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 16 && DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. +void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + /// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector /// sources of the shuffle are different. 
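The helper pair added above (moved here from the pre-legalizer combiner) rewrites a G_FCONSTANT whose only users are stores into an equivalent G_CONSTANT, so the value can be materialized in a GPR rather than requiring an FP move; with s16 now always legal for G_FCONSTANT, the combine also covers half-precision constants. A minimal standalone sketch of the bit reinterpretation it relies on (illustrative only, not code from the patch):

```cpp
// Sketch only: the APFloat -> APInt reinterpretation that
// applyFConstantToConstant performs when emitting the replacement G_CONSTANT.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

APInt fpImmAsIntegerBits(const APFloat &FPImm) {
  // For an IEEE half constant 1.0 this returns the 16-bit value 0x3C00,
  // which a store can consume from a GPR without an fmov.
  return FPImm.bitcastToAPInt();
}
```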
std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 8c10673..896eab5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -44,31 +44,6 @@ namespace { #include "AArch64GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -/// Return true if a G_FCONSTANT instruction is known to be better-represented -/// as a G_CONSTANT. -bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - Register DstReg = MI.getOperand(0).getReg(); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - if (DstSize != 32 && DstSize != 64) - return false; - - // When we're storing a value, it doesn't matter what register bank it's on. - // Since not all floating point constants can be materialized using a fmov, - // it makes more sense to just use a GPR. - return all_of(MRI.use_nodbg_instructions(DstReg), - [](const MachineInstr &Use) { return Use.mayStore(); }); -} - -/// Change a G_FCONSTANT into a G_CONSTANT. -void applyFConstantToConstant(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - MachineIRBuilder MIB(MI); - const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); - MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); - MI.eraseFromParent(); -} - /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits /// are sign bits. In this case, we can transform the G_ICMP to directly compare /// the wide value with a zero. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index f90bcc7..830a35bb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -590,6 +590,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, unsigned Depth) const { switch (MI.getOpcode()) { case AArch64::G_DUP: + case AArch64::G_SADDLP: + case AArch64::G_UADDLP: case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: case TargetOpcode::G_EXTRACT_VECTOR_ELT: @@ -798,6 +800,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (Ty.isVector()) OpRegBankIdx[Idx] = PMI_FirstFPR; else if (isPreISelGenericFloatingPointOpcode(Opc) || + (MO.isDef() && onlyDefinesFP(MI, MRI, TRI)) || + (MO.isUse() && onlyUsesFP(MI, MRI, TRI)) || Ty.getSizeInBits() > 64) OpRegBankIdx[Idx] = PMI_FirstFPR; else diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 1a697f7..ea32748 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2592,6 +2592,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && " + "!Subtarget->d16PreservesUnusedBits()">; + def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, @@ -2769,6 +2772,9 @@ def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of FeatureMAIInsts)>; +def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">, + AssemblerPredicate<(all_of (not FeatureMAIInsts))>; + def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, AssemblerPredicate<(all_of FeatureSMemRealTime)>; @@ -2943,6 +2949,20 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic( def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; + +//===----------------------------------------------------------------------===// +// HwModes +//===----------------------------------------------------------------------===// + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; + +// gfx1250, has alignment requirement but no AGPRs. +def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; + + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index cb49936..ef58004 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { - for (const auto &CI : IA->ParseConstraints()) { +/// Compute the minimum number of AGPRs required to allocate the inline asm. +static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA, + const CallBase &Call) { + unsigned ArgNo = 0; + unsigned ResNo = 0; + unsigned AGPRDefCount = 0; + unsigned AGPRUseCount = 0; + unsigned MaxPhysReg = 0; + const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout(); + + // TODO: Overestimates due to not accounting for tied operands + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + Type *Ty = nullptr; + switch (CI.Type) { + case InlineAsm::isOutput: { + Ty = Call.getType(); + if (auto *STy = dyn_cast<StructType>(Ty)) + Ty = STy->getElementType(ResNo); + ++ResNo; + break; + } + case InlineAsm::isInput: { + Ty = Call.getArgOperand(ArgNo++)->getType(); + break; + } + case InlineAsm::isLabel: + continue; + case InlineAsm::isClobber: + // Parse the physical register reference. + break; + } + for (StringRef Code : CI.Codes) { - Code.consume_front("{"); - if (Code.starts_with("a")) - return true; + unsigned RegCount = 0; + if (Code.starts_with("a")) { + // Virtual register, compute number of registers based on the type. 
+ // + // We ought to be going through TargetLowering to get the number of + // registers, but we should avoid the dependence on CodeGen here. + RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32); + } else { + // Physical register reference + auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code); + if (Kind == 'a') { + RegCount = NumRegs; + MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u)); + } + + continue; + } + + if (CI.Type == InlineAsm::isOutput) { + // Apply tuple alignment requirement + // + // TODO: This is more conservative than necessary. + AGPRDefCount = alignTo(AGPRDefCount, RegCount); + + AGPRDefCount += RegCount; + if (CI.isEarlyClobber) { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } + } else { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } } } - return false; + unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); + + // TODO: This is overly conservative. If there are any physical registers, + // allocate any virtual registers after them so we don't have to solve optimal + // packing. + return std::min(MaxVirtReg + MaxPhysReg, 256u); } // TODO: Migrate to range merge of amdgpu-agpr-alloc. @@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> { const Function *Callee = dyn_cast<Function>(CalleeOp); if (!Callee) { if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) - return !inlineAsmUsesAGPRs(IA); + return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0; return false; } - // Some intrinsics may use AGPRs, but if we have a choice, we are not - // required to use AGPRs. - if (Callee->isIntrinsic()) + switch (Callee->getIntrinsicID()) { + case Intrinsic::not_intrinsic: + break; + case Intrinsic::write_register: + case Intrinsic::read_register: + case Intrinsic::read_volatile_register: { + const MDString *RegName = cast<MDString>( + cast<MDNode>( + cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata()) + ->getOperand(0)); + auto [Kind, RegIdx, NumRegs] = + AMDGPU::parseAsmPhysRegName(RegName->getString()); + return Kind != 'a'; + } + default: + // Some intrinsics may use AGPRs, but if we have a choice, we are not + // required to use AGPRs. 
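A compact sketch of the counting rule implemented by inlineAsmGetNumRequiredAGPRs above: a virtual "a" constraint of type T consumes ceil(bits(T)/32) AGPRs, and each tuple is aligned to its own size before being appended. The helper name below is assumed for illustration; only the divideCeil/alignTo arithmetic comes from the patch:

```cpp
#include "llvm/Support/MathExtras.h"
using namespace llvm;

// Append one virtual AGPR tuple of TypeSizeInBits bits to a running count,
// mirroring the alignTo/divideCeil arithmetic in the attributor change.
unsigned appendAGPRTuple(unsigned RunningCount, unsigned TypeSizeInBits) {
  unsigned RegCount = divideCeil(TypeSizeInBits, 32); // e.g. 128 bits -> 4 AGPRs
  RunningCount = alignTo(RunningCount, RegCount);     // conservative tuple alignment
  return RunningCount + RegCount;
}
```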
return true; + } // TODO: Handle callsite attributes const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>( @@ -1504,7 +1584,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F)); A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F)); A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F)); - A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F)); CallingConv::ID CC = F->getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CC)) { A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F)); @@ -1515,6 +1594,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (!F->isDeclaration() && ST.hasClusters()) A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F)); + if (ST.hasGFX90AInsts()) + A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F)); + for (auto &I : instructions(F)) { Value *Ptr = nullptr; if (auto *LI = dyn_cast<LoadInst>(&I)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2192a72..e4d328a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -393,12 +393,13 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + const MCInstrDesc &Desc = TII->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; - int RegClass = Desc.operands()[OpIdx].RegClass; + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]); if (RegClass == -1) return nullptr; @@ -4353,7 +4354,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (!RC || SIRI->isSGPRClass(RC)) return false; - if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) { + if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass && + RC != &AMDGPU::VS_64_Align2RegClass) { AllUsesAcceptSReg = false; SDNode *User = U->getUser(); if (User->isMachineOpcode()) { @@ -4367,7 +4369,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { const TargetRegisterClass *CommutedRC = getOperandRegClass(U->getUser(), CommutedOpNo); if (CommutedRC == &AMDGPU::VS_32RegClass || - CommutedRC == &AMDGPU::VS_64RegClass) + CommutedRC == &AMDGPU::VS_64RegClass || + CommutedRC == &AMDGPU::VS_64_Align2RegClass) AllUsesAcceptSReg = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 848d9a5..557d87f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. 
// FIXME: Should we eventually add an alternative mapping with AGPR src @@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 280fbe2..c7a91f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -929,8 +929,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - AMDGPUAttributorOptions Opts; - MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { + AMDGPUAttributorOptions Opts; + MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + } } } }); @@ -964,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; @@ -1296,7 +1298,8 @@ void AMDGPUPassConfig::addIRPasses() { if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); - if (isPassEnabled(EnableImageIntrinsicOptimizer)) + if (TM.getTargetTriple().isAMDGCN() && + isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); // This can be disabled by passing ::Disable here or on the command line diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a67a7be..a8140c3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1347,6 +1347,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ForcedDPP = false; bool ForcedSDWA = false; KernelScopeInfo KernelScope; + const 
unsigned HwMode; /// @name Auto-generated Match Functions /// { @@ -1356,6 +1357,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } + /// Get size of register operand + unsigned getRegOperandSize(const MCInstrDesc &Desc, unsigned OpNo) const { + assert(OpNo < Desc.NumOperands); + int16_t RCID = MII.getOpRegClassID(Desc.operands()[OpNo], HwMode); + return getRegBitWidth(RCID) / 8; + } + private: void createConstantSymbol(StringRef Id, int64_t Val); @@ -1442,9 +1450,9 @@ public: using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) { + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI, MII), Parser(_Parser), + HwMode(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)) { MCAsmParserExtension::Initialize(Parser); setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); @@ -1944,6 +1952,7 @@ public: void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); + void cvtOpSelHelper(MCInst &Inst, unsigned OpSel); bool parseDimId(unsigned &Encoding); ParseStatus parseDim(OperandVector &Operands); @@ -4106,7 +4115,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc) { if ((DMaskIdx == -1 || TFEIdx == -1) && isGFX10_AEncoding()) // intersect_ray return true; - unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); + unsigned VDataSize = getRegOperandSize(Desc, VDataIdx); unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) @@ -4170,8 +4179,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); bool IsNSA = SrsrcIdx - VAddr0Idx > 1; unsigned ActualAddrSize = - IsNSA ? SrsrcIdx - VAddr0Idx - : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + IsNSA ? SrsrcIdx - VAddr0Idx : getRegOperandSize(Desc, VAddr0Idx) / 4; unsigned ExpectedAddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); @@ -4181,8 +4189,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { ExpectedAddrSize > getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) { int VAddrLastIdx = SrsrcIdx - 1; - unsigned VAddrLastSize = - AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4; + unsigned VAddrLastSize = getRegOperandSize(Desc, VAddrLastIdx) / 4; ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize; } @@ -4428,7 +4435,8 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, return true; const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (TRI->getRegClass(Desc.operands()[0].RegClass).getSizeInBits() <= 128) + if (TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[0], HwMode)) + .getSizeInBits() <= 128) return true; if (TRI->regsOverlap(Src2Reg, DstReg)) { @@ -4999,7 +5007,7 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + AMDGPU::isDPALU_DPP(MII.get(Opc), MII, getSTI())) { // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share // only on GFX12. 
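The parser hunks above, and the disassembler and printer hunks below, all replace direct reads of MCOperandInfo::RegClass with a lookup keyed on the subtarget's register-info hardware mode. A condensed sketch of that shared pattern, built from the same getHwMode/getOpRegClassID calls the diff introduces (illustrative fragment, not code from the patch):

```cpp
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;

// Resolve an operand's register class ID under the subtarget's RegInfo
// hardware mode rather than trusting the static MCOperandInfo::RegClass field.
static int16_t getOpRegClassForSubtarget(const MCInstrInfo &MII,
                                         const MCSubtargetInfo &STI,
                                         const MCInstrDesc &Desc,
                                         unsigned OpNo) {
  unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo);
  return MII.getOpRegClassID(Desc.operands()[OpNo], HwMode);
}
```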
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); @@ -5522,7 +5530,8 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, unsigned Fmt = Inst.getOperand(FmtIdx).getImm(); int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp); unsigned RegSize = - TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits(); + TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[SrcIdx], HwMode)) + .getSizeInBits(); if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) return true; @@ -9239,6 +9248,33 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { MCOI::OperandConstraint::TIED_TO) == -1; } +void AMDGPUAsmParser::cvtOpSelHelper(MCInst &Inst, unsigned OpSel) { + unsigned Opc = Inst.getOpcode(); + constexpr AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; + constexpr AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but + // no src1. So continue instead of break. + continue; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + // op_sel[3] is encoded in src0_modifiers. + if (ModOps[J] == AMDGPU::OpName::src0_modifiers && (OpSel & (1 << 3)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; @@ -9275,6 +9311,16 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + + // Some v_interp instructions use op_sel[3] for dst. 
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOpSel); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + cvtOpSelHelper(Inst, OpSel); + } } void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) @@ -9310,31 +9356,10 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) if (OpSelIdx == -1) return; - const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, - AMDGPU::OpName::src2}; - const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; - unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - - for (int J = 0; J < 3; ++J) { - int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); - if (OpIdx == -1) - break; - - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); - - if ((OpSel & (1 << J)) != 0) - ModVal |= SISrcMods::OP_SEL_0; - if (ModOps[J] == AMDGPU::OpName::src0_modifiers && - (OpSel & (1 << 3)) != 0) - ModVal |= SISrcMods::DST_OP_SEL; - - Inst.getOperand(ModIdx).setImm(ModVal); - } + cvtOpSelHelper(Inst, OpSel); } + void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 09a66d7..b97b738 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -417,10 +417,10 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> { } class getMUBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> { + list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> { RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getBUFVDataRegisterOperandForOp<vdataClass, isTFE>.ret; + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); @@ -453,8 +453,8 @@ class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, (ins)))))); } @@ -677,8 +677,8 @@ class MUBUF_Pseudo_Store_Lds<string opName> } 
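For the op_sel handling factored into cvtOpSelHelper above, the bit-to-modifier mapping is: op_sel bit J sets OP_SEL_0 in srcJ_modifiers, and op_sel bit 3 (the destination half) is carried in src0_modifiers as DST_OP_SEL. A small sketch of that mapping (the free function is assumed for illustration; SISrcMods is the enum from the AMDGPU SIDefines.h header):

```cpp
#include "SIDefines.h" // AMDGPU target header providing SISrcMods (assumed include path)
#include <cstdint>
using namespace llvm;

// Fold one op_sel bit into the corresponding srcJ_modifiers value, the same
// per-source update cvtOpSelHelper applies.
static uint32_t applyOpSelBit(uint32_t ModVal, unsigned OpSel, unsigned J) {
  if (OpSel & (1u << J))
    ModVal |= SISrcMods::OP_SEL_0;
  if (J == 0 && (OpSel & (1u << 3))) // dst bit is encoded in src0_modifiers
    ModVal |= SISrcMods::DST_OP_SEL;
  return ModVal;
}
```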
class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, - list<RegisterClass> vaddrList=[]> { - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + list<RegisterClassLike> vaddrList=[]> { + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); @@ -702,9 +702,9 @@ class getMUBUFAtomicIns<int addrKind, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, (ins)))))); } @@ -1568,11 +1568,12 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); - defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar data_op = getVregSrcForVT<data_vt>.ret; + defvar data_vt_RC = getVregClassForVT<data_vt>.ret; let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) - data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + data_op:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), @@ -1583,7 +1584,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string >; defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) - data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + data_op:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), @@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary)); defvar SrcRC = getVregSrcForVT<vt>.ret; - defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar DataRC = getVregClassForVT<data_vt>.ret; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); @@ -2088,7 +2089,7 @@ defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset)), @@ -2104,7 +2105,7 @@ multiclass MUBUFScratchStorePat_Common <string Instr, multiclass MUBUFScratchStorePat <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { let SubtargetPredicate = HasUnrestrictedSOffset in { defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>; } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index b2ff5a1..d0ad120 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -904,7 +904,7 @@ let SubtargetPredicate = isGFX1250Plus in { let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>; -defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>; +defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96_Align1>; } // End OtherPredicates = [HasTransposeLoadF4F6Insts] defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>; defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>; @@ -934,7 +934,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>; defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>; defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>; - defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>; + defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96_Align1>; } //===----------------------------------------------------------------------===// @@ -951,6 +951,11 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat (inst $ptr, Offset:$offset, (i1 gds)) >; +class DSReadPat_t16 <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (EXTRACT_SUBREG (inst $ptr, Offset:$offset, (i1 gds)), lo16) +>; + multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -968,13 +973,14 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> { def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; } - let OtherPredicates = [NotLDSRequiresM0Init] in { - let True16Predicate = NotUseRealTrue16Insts in { - def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; - } - let True16Predicate = UseRealTrue16Insts in { - def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>; - } + let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = NotUseRealTrue16Insts in { + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } + let OtherPredicates = [NotLDSRequiresM0Init, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>; + } + let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = UseTrue16WithSramECC in { + def : DSReadPat_t16<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2120bf8..f11b373 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -57,7 +57,9 @@ static int64_t getInlineImmVal64(unsigned Imm); AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), + MAI(*Ctx.getAsmInfo()), + HwModeRegClass(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), + TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) 
{ // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) @@ -825,7 +827,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) { + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + if (Desc.TSFlags & SIInstrFlags::MIMG) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int RsrcIdx = @@ -838,7 +841,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, for (unsigned i = 0; i < NSAArgs; ++i) { const unsigned VAddrIdx = VAddr0Idx + 1 + i; auto VAddrRCID = - MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; + MCII->getOpRegClassID(Desc.operands()[VAddrIdx], HwModeRegClass); MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i])); } Bytes = Bytes.slice(4 * NSAWords); @@ -1311,7 +1314,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // Widen the register to the correct number of enabled channels. MCRegister NewVdata; if (DstSize != Info->VDataDwords) { - auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass; + auto DataRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VDataIdx], HwModeRegClass); // Get first subregister of VData MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg(); @@ -1338,7 +1342,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA; - auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; + auto AddrRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VAddrSAIdx], HwModeRegClass); + const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID); NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC); NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 935c383..2751857 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -41,6 +41,7 @@ private: std::unique_ptr<MCInstrInfo const> const MCII; const MCRegisterInfo &MRI; const MCAsmInfo &MAI; + const unsigned HwModeRegClass; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 5a22b23..6de59be 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -229,7 +229,7 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let OutOperandList = (outs vdata_op:$vdst); @@ -268,7 +268,7 @@ multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let InOperandList 
= !con( (ins VaddrRC:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), @@ -385,7 +385,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy (outs ), !con( !if(IsAsync, (ins VGPR_32:$vdst), (ins)), - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins flat_offset:$offset, CPol_0:$cpol)), !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let LGKM_CNT = !not(IsAsync); @@ -417,7 +417,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let VM_CNT = 0; @@ -511,7 +511,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } -class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> : FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { let has_vdst = 0; let has_data = 0; @@ -533,7 +533,7 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { multiclass FLAT_Global_Prefetch_Pseudo<string opName> { let is_flat_global = 1, has_saddr = 1 in { - def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { @@ -754,7 +754,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName> { let FPAtomic = data_vt.isFP; @@ -786,7 +786,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -811,7 +811,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_agpr:$vdst), - (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn_agpr"> { let FPAtomic = data_vt.isFP; @@ -837,7 +837,7 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType data_vt = vt, RegisterOperand data_op = vdst_op, bit EnableSaddr = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> 
: FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> { let InOperandList = !con( (ins VaddrRC:$vaddr, data_op:$vdata), @@ -867,7 +867,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< RegisterOperand data_op = vdst_op, bit EnableSaddr = false, bit IsVGPR = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> { defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret); @@ -1321,7 +1321,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { } let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { - defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>; + defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96_Align1>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>; } @@ -1383,6 +1383,11 @@ class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType v (inst $vaddr, $offset, (i32 0)) >; +class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) +>; + class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1393,6 +1398,11 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value (inst $vaddr, $offset, (i32 0)) >; +class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset, (i32 0)), lo16) +>; + class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) @@ -1408,6 +1418,11 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) +>; + class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), (inst $dsaddr, $vaddr, $offset, $cpol) @@ -1443,6 +1458,11 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu (inst $saddr, $voffset, $offset, $cpol) >; +class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) +>; + class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), (inst $vaddr, $offset) @@ -1519,7 +1539,7 @@ multiclass FlatAtomicNoRtnPatBase <string 
base_inst_name, string node, ValueType let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1548,7 +1568,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1592,7 +1612,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1625,6 +1645,11 @@ class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $offset, 0) >; +class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) +>; + class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset) @@ -1645,6 +1670,11 @@ class ScratchLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Val (inst $saddr, $offset, 0) >; +class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))), + (EXTRACT_SUBREG (inst $saddr, $offset), lo16) +>; + class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)), @@ -1672,6 +1702,11 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16) +>; + multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> { def : FlatLoadLDSSignedPat_M0 <inst, node> { let AddedComplexity = 10; @@ -1764,6 +1799,16 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value } } +multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatSignedLoadPat_t16<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, 
ValueType vt> { def : FlatStoreSignedPat <inst, node, vt> { @@ -1872,8 +1917,8 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, } } -multiclass ScratchFLATStorePats_t16<string inst, SDPatternOperator node, - ValueType vt> { +multiclass ScratchFLATStorePats_D16_t16<string inst, SDPatternOperator node, + ValueType vt> { def : ScratchStoreSignedPat <!cast<FLAT_Pseudo>(inst#"_t16"), node, vt> { let AddedComplexity = 25; } @@ -1918,6 +1963,21 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : ScratchLoadSignedPat_t16 <inst, node, vt> { + let AddedComplexity = 25; + } + + def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 26; + } + + def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } +} + multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadPat <inst, node, vt> { let OtherPredicates = [HasFlatAddressSpace]; @@ -1947,6 +2007,17 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT } } +multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_t16 <inst, node, vt> { + let OtherPredicates = [HasFlatAddressSpace]; + } + + def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatStorePat <inst, node, vt> { let OtherPredicates = [HasFlatAddressSpace]; @@ -1997,6 +2068,17 @@ let True16Predicate = NotUseRealTrue16Insts in { defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } +let True16Predicate = UseTrue16WithSramECC in { + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; +} + let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; @@ -2006,11 +2088,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; +} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts + +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, 
i16>; defm : FlatStorePats_t16 <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; defm : FlatStorePats_t16 <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; -} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts +} defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; @@ -2140,6 +2225,20 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>; } +let True16Predicate = UseTrue16WithSramECC in { +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>; +} + let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; @@ -2192,6 +2291,13 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>; } +let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = UseRealTrue16Insts in { +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", truncstorei8_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", store_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>; +} + let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; @@ -2362,14 +2468,24 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>; defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>; } -let True16Predicate = UseRealTrue16Insts in { +let True16Predicate = UseTrue16WithSramECC in { +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>; +} + +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", 
sextloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>; -defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>; -defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>; -} // End True16Predicate = UseRealTrue16Insts +} // End OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts + +let True16Predicate = UseRealTrue16Insts in { +defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_SHORT", store_private, i16>; +defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>; +} foreach vt = Reg32Types.types in { defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 8821558..464cbec 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -722,7 +722,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && - AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *TII, *ST)) { LLVM_DEBUG(dbgs() << " " << OrigMI << " failed: not valid 64-bit DPP control value\n"); break; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 1d9a427..a911e7e 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -869,7 +869,7 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); int VDataRCID = -1; if (VDataIdx != -1) - VDataRCID = Desc.operands()[VDataIdx].RegClass; + VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]); if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { // There is no hazard if the instruction does not use vector regs @@ -893,8 +893,8 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. if (TII->isMIMG(MI)) { int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); - assert(SRsrcIdx != -1 && - AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); + assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID( + Desc.operands()[SRsrcIdx])) == 256); (void)SRsrcIdx; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index d3b5718..e82f998 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -788,9 +788,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Check if operand register class contains register used. // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. 
- int RCID = Desc.operands()[OpNo].RegClass; + const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); if (RCID != -1) { - const MCRegisterClass RC = MRI.getRegClass(RCID); + const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) @@ -1025,7 +1027,7 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && - AMDGPU::isDPALU_DPP(Desc, STI)) { + AMDGPU::isDPALU_DPP(Desc, MII, STI)) { O << " /* DP ALU dpp only supports " << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */"; return; @@ -1280,6 +1282,17 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but no + // src1. + if (NumOps == 1 && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2) && + !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1)) { + Ops[NumOps++] = DefaultValue; // Set src1_modifiers to default. + int Mod2Idx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); + assert(Mod2Idx != -1); + Ops[NumOps++] = MI->getOperand(Mod2Idx).getImm(); + } + const bool HasDst = (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst) != -1) || (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst) != -1); diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 291c03a..64e34db 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1516,7 +1516,8 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> { int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11))); RegisterOperand RegClass = MIMGAddrSize<num_addrs, 0>.RegClass; - defvar Size = !cast<SIRegisterClass>(RegClass.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RegClass.RegClass).Size; + int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 90c828b..6616b30 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1077,7 +1077,7 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - int16_t RCID = Desc.operands()[UseOpIdx].RegClass; + int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]); if (RCID == -1) return false; @@ -1299,10 +1299,8 @@ void SIFoldOperandsImpl::foldOperand( AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO, AMDGPU::AV_MOV_B64_IMM_PSEUDO}) { const MCInstrDesc &MovDesc = TII->get(MovOp); - assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1); - const TargetRegisterClass *MovDstRC = - TRI->getRegClass(MovDesc.operands()[0].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0])); // Fold if the destination register class of the MOV instruction (ResRC) // is a superclass of (or equal to) the destination register class of the @@ -1312,7 +1310,8 @@ void SIFoldOperandsImpl::foldOperand( const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 
2 : 1; const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); + if (MovSrcRC) { if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e233457..1a686a9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17346,74 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); - // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better balance register - // use between vgpr and agpr as agpr tuples tend to be big. - if (!MI.getDesc().operands().empty()) { - unsigned Opc = MI.getOpcode(); - bool HasAGPRs = Info->mayNeedAGPRs(); - const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - for (auto I : - {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { - if (I == -1) - break; - if ((I == Src2Idx) && (HasAGPRs)) - break; - MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !Op.getReg().isVirtual()) - continue; - auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); - if (!TRI->hasAGPRs(RC)) - continue; - auto *Src = MRI.getUniqueVRegDef(Op.getReg()); - if (!Src || !Src->isCopy() || - !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) - continue; - auto *NewRC = TRI->getEquivalentVGPRClass(RC); - // All uses of agpr64 and agpr32 can also accept vgpr except for - // v_accvgpr_read, but we do not produce agpr reads during selection, - // so no use checks are needed. - MRI.setRegClass(Op.getReg(), NewRC); - } - - if (TII->isMAI(MI)) { - // The ordinary src0, src1, src2 were legalized above. - // - // We have to also legalize the appended v_mfma_ld_scale_b32 operands, - // as a separate instruction. - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src0); - if (Src0Idx != -1) { - int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src1); - if (TII->usesConstantBus(MRI, MI, Src0Idx) && - TII->usesConstantBus(MRI, MI, Src1Idx)) - TII->legalizeOpWithMove(MI, Src1Idx); - } - } - - if (!HasAGPRs) - return; - - // Resolve the rest of AV operands to AGPRs. - if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { - if (Src2->isReg() && Src2->getReg().isVirtual()) { - auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); - if (TRI->isVectorSuperClass(RC)) { - auto *NewRC = TRI->getEquivalentAGPRClass(RC); - MRI.setRegClass(Src2->getReg(), NewRC); - if (Src2->isTied()) - MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); - } - } + if (TII->isMAI(MI)) { + // The ordinary src0, src1, src2 were legalized above. + // + // We have to also legalize the appended v_mfma_ld_scale_b32 operands, + // as a separate instruction. 
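The code that follows keeps this legalization; restated outside the diff context, the constraint is that a VALU encoding may read only a limited number of scalar or literal sources through the constant bus, so if both appended scale operands would use it, one of them is rewritten through a move into a VGPR. A standalone sketch using only calls visible in the surrounding hunks; the free-function wrapper is hypothetical.

#include "SIInstrInfo.h" // target-internal header providing SIInstrInfo and AMDGPU::OpName
using namespace llvm;

// Hypothetical standalone form of the scale-operand check retained in
// AdjustInstrPostInstrSelection for scaled MFMA instructions.
static void legalizeScaleSrcOperands(const SIInstrInfo &TII,
                                     MachineRegisterInfo &MRI,
                                     MachineInstr &MI) {
  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::scale_src0);
  if (Src0Idx == -1)
    return; // No appended scale operands to legalize.
  int Src1Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::scale_src1);
  if (TII.usesConstantBus(MRI, MI, Src0Idx) &&
      TII.usesConstantBus(MRI, MI, Src1Idx))
    TII.legalizeOpWithMove(MI, Src1Idx); // Move one read off the constant bus.
}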
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src0); + if (Src0Idx != -1) { + int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src1); + if (TII->usesConstantBus(MRI, MI, Src0Idx) && + TII->usesConstantBus(MRI, MI, Src1Idx)) + TII->legalizeOpWithMove(MI, Src1Idx); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 46757cf..ec5c5bb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5029,9 +5029,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } - int RegClass = Desc.operands()[i].RegClass; - const MCOperandInfo &OpInfo = Desc.operands()[i]; + int16_t RegClass = getOpRegClassID(OpInfo); + switch (OpInfo.OperandType) { case MCOI::OPERAND_REGISTER: if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { @@ -5635,7 +5635,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && - AMDGPU::isDPALU_DPP(Desc, ST)) { + AMDGPU::isDPALU_DPP(Desc, *this, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -6031,48 +6031,17 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -static const TargetRegisterClass * -adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, - const MCInstrDesc &TID, unsigned RCID) { - if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) { - switch (RCID) { - case AMDGPU::AV_32RegClassID: - RCID = AMDGPU::VGPR_32RegClassID; - break; - case AMDGPU::AV_64RegClassID: - RCID = AMDGPU::VReg_64RegClassID; - break; - case AMDGPU::AV_96RegClassID: - RCID = AMDGPU::VReg_96RegClassID; - break; - case AMDGPU::AV_128RegClassID: - RCID = AMDGPU::VReg_128RegClassID; - break; - case AMDGPU::AV_160RegClassID: - RCID = AMDGPU::VReg_160RegClassID; - break; - case AMDGPU::AV_512RegClassID: - RCID = AMDGPU::VReg_512RegClassID; - break; - default: - break; - } - } - - return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); -} - +// FIXME: This should not be an overridable function. All subtarget dependent +// operand modifications should go through isLookupRegClassByHwMode in the +// generic handling. const TargetRegisterClass * SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const { if (OpNum >= TID.getNumOperands()) return nullptr; - auto RegClass = TID.operands()[OpNum].RegClass; - // Special pseudos have no alignment requirement. 
- if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID)) - return RI.getRegClass(RegClass); - - return adjustAllocatableRegClass(ST, RI, TID, RegClass); + const MCOperandInfo &OpInfo = TID.operands()[OpNum]; + int16_t RegClass = getOpRegClassID(OpInfo); + return RI.getRegClass(RegClass); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -6090,8 +6059,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getPhysRegBaseClass(Reg); } - unsigned RCID = Desc.operands()[OpNo].RegClass; - return adjustAllocatableRegClass(ST, RI, Desc, RCID); + return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6099,7 +6067,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; + unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]); const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO @@ -6168,7 +6136,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); - const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); + const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo)); if (Reg.isPhysical()) return DRC->contains(Reg); @@ -6293,8 +6261,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; + int64_t RegClass = getOpRegClassID(OpInfo); const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + RegClass != -1 ? RI.getRegClass(RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); @@ -7619,7 +7588,7 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx, if (!RI.isVGPRClass(CurrRC)) return; - unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]); const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID); if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) { Op.setSubReg(AMDGPU::lo16); @@ -9323,7 +9292,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, // Is this operand statically required to be an SGPR based on the operand // constraints? const TargetRegisterClass *OpRC = - RI.getRegClass(Desc.operands()[Idx].RegClass); + RI.getRegClass(getOpRegClassID(Desc.operands()[Idx])); bool IsRequiredSGPR = RI.isSGPRClass(OpRC); if (IsRequiredSGPR) return MO.getReg(); @@ -9804,7 +9773,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { if (Idx == -1) // e.g. 
s_memtime return false; - const auto RCID = MI.getDesc().operands()[Idx].RegClass; + const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]); return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index cc59acf..e979eeb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1298,7 +1298,7 @@ public: return 4; } - return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; + return RI.getRegSizeInBits(*RI.getRegClass(getOpRegClassID(OpInfo))) / 8; } /// This form should usually be preferred since it handles operands diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 18a5393..b7f63ec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1151,6 +1151,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +// FIXME: Should change class based on hasSDWAScalar to exclude SGPRs class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; string Type = !if(vt.isFP, "FP", "INT"); @@ -1807,13 +1808,13 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16, VOPDstOperand_t16Lo128), VOPDstOperand<VGPR_32>); - RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>, - !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, - !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, - !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, - !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, - !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, - !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, + RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024_AlignTarget>, + !eq(VT.Size, 512) : VOPDstOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 256) : VOPDstOperand<VReg_256_AlignTarget>, + !eq(VT.Size, 192) : VOPDstOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : VOPDstOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : VOPDstOperand<VReg_64_AlignTarget>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, 1 : VOPDstS64orS32); // else VT == i1 @@ -1821,8 +1822,8 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { class getVALUDstForVT_fake16<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128_AlignTarget>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64_AlignTarget>, !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32_Lo128>, VOPDstS64orS32)))); // else VT == i1 } @@ -1890,21 +1891,38 @@ class getSOPSrcForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32); } -// Returns the vreg register class to use for source operand given VT +// Returns the vreg register operand to use for source operand given VT. +// This should only be used for a target instruction's ins list. 
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { RegisterOperand ret = - !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>, - !eq(VT.Size, 192) : RegisterOperand<VReg_192>, - !eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 192) : RegisterOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128), RegisterOperand<VGPR_32>), 1 : RegisterOperand<VGPR_32>); } +// Returns a concrete vgpr register class to use for a value type VT, +// which exists separately from a real instruction use. +class getVregClassForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { + RegisterClass ret = + !cond(!eq(VT.Size, 512) : VReg_512, + !eq(VT.Size, 192) : VReg_192, + !eq(VT.Size, 128) : VReg_128, + !eq(VT.Size, 96) : VReg_96, + !eq(VT.Size, 64) : VReg_64, + !eq(VT.Size, 48) : VReg_64, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), + VGPR_32), + 1 : VGPR_32); +} + class getSDWASrcForVT <ValueType VT> { RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); @@ -2638,7 +2656,7 @@ class getAlign2RegOp<RegisterOperand RC> { } class getEquivalentAGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = !cond(!eq(Size, 32) : RegisterOperand<AGPR_32>, !eq(Size, 64) : RegisterOperand<AReg_64>, @@ -2649,16 +2667,33 @@ class getEquivalentAGPROperand<RegisterOperand RC> { } class getEquivalentVGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = - !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>, - !eq(Size, 64) : RegisterOperand<VReg_64>, - !eq(Size, 96) : RegisterOperand<VReg_96>, - !eq(Size, 128) : RegisterOperand<VReg_128>, - !eq(Size, 160) : RegisterOperand<VReg_160>, - !eq(Size, 1024) : RegisterOperand<VReg_1024>); + !cond( + !eq(RC, VGPROp_32) : VGPROp_32, + !eq(RC, VGPROp_64) : VGPROp_64, + + !eq(RC, AVLdSt_32) : VGPROp_32, + !eq(RC, AVLdSt_64) : VGPROp_64, + !eq(RC, AVLdSt_96) : VGPROp_96, + !eq(RC, AVLdSt_128) : VGPROp_128, + !eq(RC, AVLdSt_160) : VGPROp_160, + !eq(RC, AVLdSt_1024) : VGPROp_1024, + + !eq(RC, AVLdSt_64_Align2) : VGPROp_64_Align2, + !eq(RC, AVLdSt_96_Align2) : VGPROp_96_Align2, + !eq(RC, AVLdSt_128_Align2) : VGPROp_128_Align2, + !eq(RC, AVLdSt_160_Align2) : VGPROp_160_Align2, + !eq(RC, AVLdSt_1024_Align2) : VGPROp_1024_Align2, + + !eq(RC, AVLdSt_64_Align1) : VGPROp_64_Align1, + !eq(RC, AVLdSt_96_Align1) : VGPROp_96_Align1, + !eq(RC, AVLdSt_128_Align1) : VGPROp_128_Align1, + !eq(RC, AVLdSt_160_Align1) : VGPROp_160_Align1, + !eq(RC, AVLdSt_1024_Align1) : VGPROp_1024_Align1); } + class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32, ValueType Src2VT = i32> { bit ret = !if(!eq(DstVT.Size, 64), @@ -3190,7 +3225,7 @@ class Commutable_REV <string revOp, bit isOrig> { // 
Interpolation opcodes //===----------------------------------------------------------------------===// -class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">; +class VINTRPDstOperand <RegisterClassLike rc> : RegisterOperand <rc, "printVINTRPDst">; class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index be084a9..eac9fd4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -120,6 +120,8 @@ def ATOMIC_FENCE : SPseudoInstSI< let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns +// No align needed as it will be decomposed anyway +// TODO: Remove alignment requirement from sources def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { let isPseudo = 1; @@ -129,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0)> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -163,9 +165,6 @@ def AV_MOV_B32_IMM_PSEUDO // 64-bit materialize immediate which supports AGPR or VGPR. This has // an unusual operand restriction which requires the two halves of the // immediate to each be 32-bit inline immediate values. -// -// FIXME: This unnecessarily has the even aligned vector register -// requirement applied. def AV_MOV_B64_IMM_PSEUDO : VPseudoInstSI<(outs AV_64:$vdst), (ins AV_64_PSEUDO_IMM:$src0)> { let isReMaterializable = 1; @@ -381,13 +380,13 @@ foreach Op = Operations in { let usesCustomInserter = 1, Defs = [VCC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] >; def V_SUB_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] >; } // End usesCustomInserter = 1, Defs = [VCC] @@ -1142,7 +1141,7 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register // needs to be used and an extra instruction to move between VGPR and AGPR. // UsesTmp adds to the total size of an expanded spill in this case. -multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, +multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class, bit UsesTmp = 0, bit HasMask = 0> { let UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] in { @@ -1177,21 +1176,25 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, } // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] } +// TODO: Technically the AlignTarget register class constraint is +// overly conservative for gfx90a. 
There is an alignment requirement, +// but the underlying spill will be lowered to 32-bit accesses. + defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>; defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; -defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; -defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; -defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; -defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; -defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; -defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; -defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; -defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>; -defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>; -defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>; -defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>; -defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; -defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64_AlignTarget>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96_AlignTarget>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128_AlignTarget>; +defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160_AlignTarget>; +defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192_AlignTarget>; +defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224_AlignTarget>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256_AlignTarget>; +defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288_AlignTarget>; +defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320_AlignTarget>; +defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352_AlignTarget>; +defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384_AlignTarget>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512_AlignTarget>; +defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024_AlignTarget>; let Defs = [M0] in { // Spills a block of 32 VGPRs. M0 will contain a mask describing which @@ -1200,34 +1203,34 @@ let Defs = [M0] in { } defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>; -defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>; -defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>; -defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; -defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; -defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; -defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; -defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; -defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>; -defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>; -defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>; -defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>; -defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; -defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64_AlignTarget, 1>; +defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96_AlignTarget, 1>; +defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128_AlignTarget, 1>; +defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160_AlignTarget, 1>; +defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192_AlignTarget, 1>; +defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224_AlignTarget, 1>; +defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256_AlignTarget, 1>; +defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288_AlignTarget, 1>; +defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320_AlignTarget, 1>; +defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352_AlignTarget, 1>; +defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384_AlignTarget, 1>; +defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512_AlignTarget, 1>; +defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024_AlignTarget, 1>; defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; -defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; -defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; -defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; -defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; -defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; 
-defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; -defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; -defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>; -defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>; -defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>; -defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; -defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; -defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64_AlignTarget, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96_AlignTarget, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128_AlignTarget, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160_AlignTarget, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192_AlignTarget, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224_AlignTarget, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256_AlignTarget, 1>; +defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288_AlignTarget, 1>; +defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320_AlignTarget, 1>; +defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352_AlignTarget, 1>; +defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384_AlignTarget, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512_AlignTarget, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024_AlignTarget, 1>; let isConvergent = 1 in { defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; @@ -2383,18 +2386,24 @@ let True16Predicate = UseRealTrue16Insts in { } } -// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit -// immediate and wil be expanded as needed, but we will only use these patterns -// for values which can be encoded. -def : GCNPat < - (VGPRImm<(i64 imm)>:$imm), - (V_MOV_B64_PSEUDO imm:$imm) ->; +/// FIXME: Increasing the priority of VGPRImm over the scalar forms as +/// a workaround for a phase ordering problem caused by overly +/// conservative MachineCSE. If we end up with an s_mov_b64 + copy to +/// vgpr pattern, MachineCSE will not perform the CSE which occurs +/// after operand folding. +let AddedComplexity = 1 in { + // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit + // immediate and wil be expanded as needed, but we will only use these patterns + // for values which can be encoded. + def : GCNPat < + (VGPRImm<(i64 imm)>:$imm), + (V_MOV_B64_PSEUDO imm:$imm)>; -def : GCNPat < - (VGPRImm<(f64 fpimm)>:$imm), - (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) ->; + def : GCNPat < + (VGPRImm<(f64 fpimm)>:$imm), + (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) + >; +} // End let AddedComplexity = 2 def : GCNPat < (i64 imm:$imm), diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 908d856..b398db4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,17 +33,20 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -static cl::opt<bool> MFMAVGPRForm( - "amdgpu-mfma-vgpr-form", cl::Hidden, +static cl::opt<bool, true> MFMAVGPRFormOpt( + "amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), - cl::init(false)); + cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false), + cl::Hidden); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine()); } +bool SIMachineFunctionInfo::MFMAVGPRForm = false; + SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)), @@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); if (ST.hasGFX90AInsts()) { - // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection - // should be separated from availability of AGPRs - if (MFMAVGPRForm || - (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() && - !mayUseAGPRs(F))) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the + // allocation granule and clamping. + auto [MinNumAGPRAttr, MaxNumAGPRAttr] = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u}, + /*OnlyFirstRequired=*/true); + MinNumAGPRs = MinNumAGPRAttr; } if (AMDGPU::isChainCC(CC)) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 4560615..b7dbb59 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -509,7 +509,9 @@ private: // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; - bool MayNeedAGPRs : 1; + /// Minimum number of AGPRs required to allocate in the function. Only + /// relevant for gfx90a-gfx950. For gfx908, this should be infinite. + unsigned MinNumAGPRs = ~0u; // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since @@ -537,6 +539,8 @@ private: void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override; public: + static bool MFMAVGPRForm; + struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; @@ -1196,9 +1200,7 @@ public: unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; } - bool mayNeedAGPRs() const { - return MayNeedAGPRs; - } + unsigned getMinNumAGPRs() const { return MinNumAGPRs; } // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. 
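With MayNeedAGPRs replaced by a numeric MinNumAGPRs, the input to the heuristic is now the "amdgpu-agpr-alloc" function attribute rather than a single boolean. A minimal sketch of the seeding step shown in the SIMachineFunctionInfo hunks above, assuming the target-internal AMDGPUBaseInfo.h; the wrapper name is hypothetical, and as the in-tree FIXME notes, the raw value still needs the allocation granule and clamping applied.

#include "Utils/AMDGPUBaseInfo.h" // target-internal: AMDGPU::getIntegerPairAttribute
#include "llvm/IR/Function.h"
using namespace llvm;

// Returns the requested minimum AGPR count for F, or ~0u (no known limit, the
// gfx908-style default) when the attribute is absent.
static unsigned seedMinNumAGPRs(const Function &F) {
  auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                      /*OnlyFirstRequired=*/true);
  (void)MaxNumAGPRAttr; // Only the minimum feeds SIMachineFunctionInfo here.
  return MinNumAGPRAttr;
}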
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index afe76e1..bfac639 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1338,8 +1338,9 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; unsigned I = Op.getOperandNo(); - if (Desc.operands()[I].RegClass == -1 || - !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass))) + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); + if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) continue; if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3115579..be1c883 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -328,7 +328,8 @@ struct SGPRSpillBuilder { SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(), ST.getAMDGPUDwarfFlavour(), - /*PC=*/0, ST.getHwMode()), + /*PC=*/0, + ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 82fc240..fc8f46a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -91,16 +91,23 @@ class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0, int Index = !cast<int>(regIdx); } -// For register classes that use TSFlags. -class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> - : RegisterClass <n, rTypes, Align, rList> { +class SIRegisterClassLike<int BW = 0, bit V = false, + bit A = false, + bit S = false> { + // Bitwidth of the register + field int Size = BW; + // For vector register classes. - field bit HasVGPR = 0; - field bit HasAGPR = 0; + field bit HasVGPR = V; + field bit HasAGPR = A; // For scalar register classes. - field bit HasSGPR = 0; + field bit HasSGPR = S; +} +// For register classes that use TSFlags. +class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> + : RegisterClass <n, rTypes, Align, rList>, SIRegisterClassLike { // Alignment of the first register in tuple (in 32-bit units). field int RegTupleAlignUnits = 1; @@ -991,7 +998,8 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1, BaseClassPriority = 1 in { + let HasVGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeVReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1031,7 +1039,8 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeAReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. 
def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1197,15 +1206,87 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 } //===----------------------------------------------------------------------===// -// Register operands +// +// AlignTarget classes. Artifical classes to swap between +// even-aligned and any-aligned classes depending on subtarget. +// //===----------------------------------------------------------------------===// +def AV_LdSt_32_Target : RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + let DecoderMethod = "decodeAVLdSt"; +} + +foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { + def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; + } + + def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [!cast<RegisterClass>("AReg_"#RegSize), + !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/]> { + let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; + } + + def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; + } + + def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize)]> { + let DecoderMethod = "decodeAVLdSt"; + } +} + +def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VS_64, VS_64_Align2, VS_64_Align2]> { + let DecoderMethod = "decodeSrcRegOrImm9"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; } -class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> +class RegOrImmOperand <RegisterClassLike RegClass, string OperandTypeName> : RegisterOperand<RegClass> { let OperandNamespace = "AMDGPU"; let OperandType = OperandTypeName; @@ -1213,14 +1294,18 @@ 
class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// +// Register operands +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate // if supported by target. //===----------------------------------------------------------------------===// -class SrcRegOrImm9<RegisterClass regClass, string operandType> +class SrcRegOrImm9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { string DecoderMethodName = "decodeSrcRegOrImm9"; - let DecoderMethod = DecoderMethodName # "<" # regClass.Size # ">"; + let DecoderMethod = DecoderMethodName # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16> @@ -1277,12 +1362,12 @@ def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">; def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; -def VSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_INT64">; -def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { +def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; +def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { let DecoderMethod = "decodeOperand_VSrc_f64"; } -def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; -def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">; +def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">; def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; @@ -1292,19 +1377,19 @@ def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16 // This is for operands with the enum(9), VSrc encoding restriction, // but only allows VGPRs. 
-class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> { - let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">"; +class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> { + let DecoderMethod = "decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64>; -def VRegSrc_96 : SrcReg9<VReg_96>; -def VRegSrc_128 : SrcReg9<VReg_128>; -def VRegSrc_192 : SrcReg9<VReg_192>; -def VRegSrc_256 : SrcReg9<VReg_256>; -def VRegSrc_384 : SrcReg9<VReg_384>; -def VRegSrc_512 : SrcReg9<VReg_512>; -def VRegSrc_1024 : SrcReg9<VReg_1024>; +def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; +def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; +def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; +def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; +def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; +def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; +def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; +def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1325,23 +1410,23 @@ class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> { class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> { let DecoderMethod = "Decode" # regClass # "RegisterClass"; } -multiclass VGPROp_Aligned<RegisterClass regClass> { - def _Align1 : VGPROp<regClass>; - def _Align2 : VGPROp_Align2<regClass>; -} // TODO: These cases should use default target alignment def VGPROp_16 : VGPROp<VGPR_16> { let EncoderMethod = "getMachineOpValueT16"; } + def VGPROp_32 : VGPROp<VGPR_32>; foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "320", "352", "384", "512", "1024"] in { - def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>; -} + // Target default alignment + def VGPROp_#size : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_AlignTarget)>; + + // No alignment requirement + def VGPROp_#size#_Align1 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size)>; -foreach size = ["64", "96", "128", "160", "256", "1024"] in { - defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>; + // Always even alignment requirement + def VGPROp_#size#_Align2 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_Align2)>; } def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { @@ -1357,9 +1442,9 @@ def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// -class AVOperand<RegisterClass regClass, string decoder> +class AVOperand<RegisterClassLike regClass, string decoder> : RegisterOperand<regClass> { - let DecoderMethod = decoder # "<" # regClass.Size # ">"; + let DecoderMethod = decoder # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; let EncoderMethod = "getAVOperandEncoding"; } @@ -1374,13 +1459,13 @@ def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; -def VCSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_INT64">; -def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; +def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : 
SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; -def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; // True 16 Operands @@ -1391,73 +1476,80 @@ def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_64_f16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_64_b32 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_64_f64 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_128_f16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_128_b32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_128_f32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_b32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR 
//===----------------------------------------------------------------------===// -class AVSrcOperand<RegisterClass regClass> +class AVSrcOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeSrcAV10">; def AVSrc_32 : AVSrcOperand<AV_32>; -def AVSrc_64 : AVSrcOperand<AV_64>; -def AVSrc_128 : AVSrcOperand<AV_128>; -def AVSrc_192 : AVSrcOperand<AV_192>; -def AVSrc_256 : AVSrcOperand<AV_256>; +def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; +def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>; +def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>; +def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>; -class AVDstOperand<RegisterClass regClass> +def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; +def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>; +def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>; +def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>; + +class AVDstOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAV10">; def AVDst_128 : AVDstOperand<AV_128>; def AVDst_256 : AVDstOperand<AV_256>; def AVDst_512 : AVDstOperand<AV_512>; -class AVLdStOperand<RegisterClass regClass> +def AVDst_128_Align2 : AVDstOperand<AV_128_Align2>; +def AVDst_256_Align2 : AVDstOperand<AV_256_Align2>; +def AVDst_512_Align2 : AVDstOperand<AV_512_Align2>; + +class AVLdStOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAVLdSt">; -def AVLdSt_32 : AVLdStOperand<AV_32>; +def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>; foreach size = ["64", "96", "128", "160", "256", "1024" ] in { - // TODO: These cases should use target align variant - def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - - def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>; + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; + def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>; + def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// -class SrcRegOrImmA9<RegisterClass regClass, string operandType> +class SrcRegOrImmA9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { - let DecoderMethod = "decodeSrcRegOrImmA9<" # regClass.Size # ">"; + let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_256_f64 : 
SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; //===----------------------------------------------------------------------===// // Tablegen programming utilities @@ -1467,10 +1559,10 @@ def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; /// instruction's operand list, which may be a RegisterOperand or a /// direct RegisterClass reference. class getRegClassFromOp<DAGOperand Op> { - SIRegisterClass ret = !if( + SIRegisterClassLike ret = !if( !isa<RegisterOperand>(Op), - !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass), - !cast<SIRegisterClass>(Op)); + !cast<SIRegisterClassLike>(!cast<RegisterOperand>(Op).RegClass), + !cast<SIRegisterClassLike>(Op)); } /// Check if the operand will use an AV_* class. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f7f4d46..76023d2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1569,12 +1569,7 @@ static bool isValidRegPrefix(char C) { return C == 'v' || C == 's' || C == 'a'; } -std::tuple<char, unsigned, unsigned> -parseAsmConstraintPhysReg(StringRef Constraint) { - StringRef RegName = Constraint; - if (!RegName.consume_front("{") || !RegName.consume_back("}")) - return {}; - +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef RegName) { char Kind = RegName.front(); if (!isValidRegPrefix(Kind)) return {}; @@ -1601,6 +1596,14 @@ parseAsmConstraintPhysReg(StringRef Constraint) { return {}; } +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint) { + StringRef RegName = Constraint; + if (!RegName.consume_front("{") || !RegName.consume_back("}")) + return {}; + return parseAsmPhysRegName(RegName); +} + std::pair<unsigned, unsigned> getIntegerPairAttribute(const Function &F, StringRef Name, std::pair<unsigned, unsigned> Default, @@ -2927,13 +2930,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { return getRegBitWidth(RC.getID()); } -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo) { - assert(OpNo < Desc.NumOperands); - unsigned RCID = Desc.operands()[OpNo].RegClass; - return getRegBitWidth(RCID) / 8; -} - bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { if (isInlinableIntLiteral(Literal)) return true; @@ -3499,14 +3495,18 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { return false; } -bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) { int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); if (Idx == -1) continue; - if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID || - OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID) + const MCOperandInfo &OpInfo = OpDesc.operands()[Idx]; + int16_t RegClass = MII.getOpRegClassID( + OpInfo, ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); + if (RegClass == AMDGPU::VReg_64RegClassID || + RegClass == 
AMDGPU::VReg_64_Align2RegClassID) return true; } @@ -3533,14 +3533,15 @@ bool isDPALU_DPP32BitOpc(unsigned Opc) { } } -bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) return false; if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); - return hasAny64BitVGPROperands(OpDesc); + return hasAny64BitVGPROperands(OpDesc, MII, ST); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2b9c063..49b4d02 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1014,6 +1014,13 @@ bool isReadOnlySegment(const GlobalValue *GV); bool shouldEmitConstantsToTextSection(const Triple &TT); /// Returns a valid charcode or 0 in the first entry if this is a valid physical +/// register name. Followed by the start register number, and the register +/// width. Does not validate the number of registers exists in the class. Unlike +/// parseAsmConstraintPhysReg, this does not expect the name to be wrapped in +/// "{}". +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef TupleString); + +/// Returns a valid charcode or 0 in the first entry if this is a valid physical /// register constraint. Followed by the start register number, and the register /// width. Does not validate the number of registers exists in the class. std::tuple<char, unsigned, unsigned> @@ -1620,10 +1627,6 @@ unsigned getRegBitWidth(unsigned RCID); /// Get the size in bits of a register from the register class \p RC. unsigned getRegBitWidth(const MCRegisterClass &RC); -/// Get size of register operand -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo); - LLVM_READNONE inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { switch (OpInfo.OperandType) { @@ -1780,13 +1783,15 @@ inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { } /// \returns true if an instruction may have a 64-bit VGPR operand. -bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. bool isDPALU_DPP32BitOpc(unsigned Opc); /// \returns true if an instruction is a DP ALU DPP.
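For the two parsers declared in this header hunk, a short usage sketch: parseAsmConstraintPhysReg expects the inline-asm constraint spelling with braces, while the new parseAsmPhysRegName takes a bare register or register-tuple name; per the doc comments, both return the kind character (0 on failure), the starting register number, and the width. The example strings below are hypothetical.

#include "Utils/AMDGPUBaseInfo.h" // target-internal header declaring both parsers
using namespace llvm;

static void physRegParsingExample() {
  // Inline-asm constraint spelling, braces included.
  auto [CKind, CStart, CWidth] = AMDGPU::parseAsmConstraintPhysReg("{v[8:11]}");
  // Bare register tuple name, no braces, accepted by the new entry point.
  auto [NKind, NStart, NWidth] = AMDGPU::parseAsmPhysRegName("v[8:11]");
  // For a braced register name both parses should agree; a leading 0 in the
  // tuple signals an invalid name or constraint.
  (void)CKind; (void)CStart; (void)CWidth;
  (void)NKind; (void)NStart; (void)NWidth;
}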
-bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 30dab55..d87d250 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -405,7 +405,7 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> { field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), !if(!eq(vt.Size, 64), - (ins VSrc_f64:$src0, VReg_64:$src1, ImmOpType:$imm), + (ins VSrc_f64:$src0, VReg_64_AlignTarget:$src1, ImmOpType:$imm), (ins VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm))); field dag InsVOPDX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); let InsVOPDX_immX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immX); @@ -474,10 +474,10 @@ def VOP_MADMK_F64 : VOP_MADMK <f64>; // given VT. class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> { RegisterOperand ret = - !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, RegisterOperand<VGPR_32>, RegisterOperand<VGPR_16>), diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 4a2b54d..42ec8ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -97,6 +97,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : VOP3_Pseudo<OpName, P, pattern> { let AsmMatchConverter = "cvtVOP3Interp"; let mayRaiseFPException = 0; + let VOP3_OPSEL = P.HasOpSel; } def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { @@ -119,16 +120,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let HasSrc0Mods = 0; } -class getInterp16Asm <bit HasSrc2, bit HasOMod> { +class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> { string src2 = !if(HasSrc2, ", $src2_modifiers", ""); string omod = !if(HasOMod, "$omod", ""); + string opsel = !if(OpSel, "$op_sel", ""); string ret = - " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod; + " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel; } class getInterp16Ins <bit HasSrc2, bit HasOMod, - Operand Src0Mod, Operand Src2Mod> { - dag ret = !if(HasSrc2, + Operand Src0Mod, Operand Src2Mod, bit OpSel> { + dag ret1 = !if(HasSrc2, !if(HasOMod, (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, InterpAttr:$attr, InterpAttrChan:$attrchan, @@ -143,19 +145,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, InterpAttr:$attr, InterpAttrChan:$attrchan, highmod:$high, Clamp0:$clamp, omod0:$omod) ); + dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins)); + dag ret = !con(ret1, ret2); } -class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { +class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> { let IsSingle = 1; let HasOMod = !ne(DstVT.Value, f16.Value); let HasHigh = 1; + let HasOpSel = OpSel; let 
Src0Mod = FPVRegInputMods; let Src2Mod = FPVRegInputMods; let Outs64 = (outs DstRC.RegClass:$vdst); - let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret; - let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret; + let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret; + let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret; } //===----------------------------------------------------------------------===// @@ -480,7 +485,7 @@ let SubtargetPredicate = isGFX9Plus in { defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>; defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>; let OtherPredicates = [isNotGFX90APlus] in -def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; +def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>; } // End SubtargetPredicate = isGFX9Plus // This predicate should only apply to the selection pattern. The @@ -2676,6 +2681,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> } } +multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> { + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> { + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); + let AsmString = AsmName # ps.AsmOperands; + } +} + multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> { def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>, VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> { @@ -2788,7 +2801,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">; defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">; -defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">; +defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">; defm V_ADD_I32 : VOP3_Real_vi <0x29c>; defm V_SUB_I32 : VOP3_Real_vi <0x29d>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5daf860..7cfd059 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3P_Mix_Profile<P, Features, 0> { let IsTrue16 = 1; - let IsRealTrue16 = 1; + let IsRealTrue16 = 1; let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret; } @@ -705,16 +705,16 @@ foreach Type = ["U", "I"] in (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand<AGPR_32>; -def ADst_64 : VOPDstOperand<AReg_64>; -def ADst_128 : VOPDstOperand<AReg_128>; -def ADst_256 : VOPDstOperand<AReg_256>; -def ADst_512 : VOPDstOperand<AReg_512>; -def ADst_1024 : VOPDstOperand<AReg_1024>; -def VDst_64 : VOPDstOperand<VReg_64>; -def VDst_128 : VOPDstOperand<VReg_128>; -def VDst_256 : VOPDstOperand<VReg_256>; -def VDst_512 : VOPDstOperand<VReg_512>; -def VDst_1024 : VOPDstOperand<VReg_1024>; +def ADst_64 : VOPDstOperand<AReg_64_AlignTarget>; +def ADst_128 : VOPDstOperand<AReg_128_AlignTarget>; 
+def ADst_256 : VOPDstOperand<AReg_256_AlignTarget>; +def ADst_512 : VOPDstOperand<AReg_512_AlignTarget>; +def ADst_1024 : VOPDstOperand<AReg_1024_AlignTarget>; +def VDst_64 : VOPDstOperand<VReg_64_AlignTarget>; +def VDst_128 : VOPDstOperand<VReg_128_AlignTarget>; +def VDst_256 : VOPDstOperand<VReg_256_AlignTarget>; +def VDst_512 : VOPDstOperand<VReg_512_AlignTarget>; +def VDst_1024 : VOPDstOperand<VReg_1024_AlignTarget>; def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> { let Src0RC64 = ARegSrc_32; @@ -811,23 +811,23 @@ def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; -def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; - -def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, 
AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; + +def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>; def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>; @@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> : } // Currently assumes scaled instructions never have abid -class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag < +class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, node:$src0_modifiers, node:$scale_src0, node:$src1_modifiers, node:$scale_src1), @@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = (ops node:$blgp))), !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), - (Op $src0, $src1, $src2, $cbsz, $blgp))), - pred ->; - -defvar MayNeedAGPRs = [{ - return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; - -defvar MayNeedAGPRs_gisel = [{ - return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + (Op $src0, $src1, $src2, $cbsz, $blgp)))>; -defvar MayNotNeedAGPRs = [{ - return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; +class CanUseAGPR_MAI<ValueType vt> { + code PredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; -defvar MayNotNeedAGPRs_gisel = [{ - return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + code GISelPredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; +} -class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, +class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = 
true, bit Scaled = false> : - MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNeedAGPRs_gisel; -} + MAIFrag<Op, HasAbid, Scaled>, + CanUseAGPR_MAI<vt>; class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, - bit Scaled = false> : - MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNotNeedAGPRs_gisel; -} + bit Scaled = false> : + MAIFrag<Op, HasAbid, Scaled>; let isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; @@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, bit HasAbid = true, bit Scaled = false> { defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap; + defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P); + defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"); + let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { - def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def _e64 : MAIInst<OpName, ProfileAGPR, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<0, "AGPR", NAME # "_e64">; let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in - def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR, !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">; } @@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR, + !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">; let OtherPredicates = [isGFX90APlus] in - def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR, !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; } @@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; defvar HasAbid = false; - - defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap; + defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl); + defvar NoDstOverlap = Profile.NoDstOverlap; def _e64 : ScaledMAIInst<OpName, - !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>, + !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, 
HasAbid, true>)>, MFMATable<0, "AGPR", NAME # "_e64">; def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd", @@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = UnscaledOpName_ in { def _mac_e64 : ScaledMAIInst<OpName # "_mac", - !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>, + !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>, MFMATable<1, "AGPR", NAME # "_e64">; def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd", diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 631f0f3..8325c62 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -419,6 +419,13 @@ class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, let Inst{14-11} = scale_sel; } +class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> { + let Inst{11} = src0_modifiers{2}; + // There's no src1 + let Inst{13} = src2_modifiers{2}; + let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 2e8a676..ce1cdb3 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -232,6 +232,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::SP); markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); + markSuperRegs(Reserved, ARM::FPSCR_RM); markSuperRegs(Reserved, ARM::APSR_NZCV); if (TFI->isFPReserved(MF)) markSuperRegs(Reserved, STI.getFramePointerReg()); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index f4ac6bb..2a40fb9 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 31650e0..6771106 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -435,14 +435,14 @@ def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">; // FP Binary Operations. 
// -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", @@ -453,21 +453,21 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", @@ -478,42 +478,42 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = 
[FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", @@ -524,21 +524,21 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", @@ -549,7 +549,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", @@ -589,7 +589,7 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>; multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", - isUnpredicable = 1 in { + isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"), @@ -621,7 +621,7 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b), (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; // These are encoded as unary instructions. -let Defs = [FPSCR_NZCV] in { +let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", "", @@ -684,7 +684,7 @@ def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0, IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm", [(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>; -let Defs = [FPSCR_NZCV] in { +let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", "", @@ -742,6 +742,7 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, } } // Defs = [FPSCR_NZCV] +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", @@ -762,6 +763,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, } // Special case encoding: bits 11-8 is 0b1011. +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", [(set SPR:$Sd, (fpround DPR:$Dm))]>, @@ -787,7 +789,7 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, } // Between half, single and double-precision. 
-let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, @@ -799,7 +801,7 @@ def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, @@ -821,7 +823,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, @@ -835,7 +837,7 @@ def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, @@ -853,6 +855,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", "", @@ -876,6 +879,7 @@ def : FP16Pat<(f64 (f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", @@ -901,6 +905,7 @@ def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), (i32 (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$a), GPR))>, Requires<[HasFPARMv8, HasDPVFP]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", "", @@ -915,6 +920,7 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, let hasSideEffects = 0; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", @@ -934,7 +940,8 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, multiclass vcvt_inst<string opc, bits<2> rm, SDPatternOperator node = null_frag> { - let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0, + mayRaiseFPException = 1 in { def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"), @@ -1055,7 +1062,9 @@ def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0, 
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm", [(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>; -multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { +multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node, + list<Register> uses = [], bit fpexc = 0> { + let Uses = uses, mayRaiseFPException = fpexc in { def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm", @@ -1081,6 +1090,7 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { let Inst{7} = op2; let Inst{16} = op; } + } def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"), (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>, @@ -1093,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { @@ -1140,18 +1150,21 @@ defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", @@ -1486,6 +1499,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let hasSideEffects = 0; } +let mayRaiseFPException = 1 in def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", @@ -1502,6 +1516,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSITOD (VLDRS addrmode5:$a))>; } +let mayRaiseFPException = 1 in def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, (outs SPR:$Sd),(ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", @@ -1520,6 +1535,7 @@ def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; +let mayRaiseFPException = 1 in def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm", @@ -1532,6 +1548,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; +let mayRaiseFPException = 1 in def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", @@ -1548,6 +1565,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VUITOD (VLDRS addrmode5:$a))>; } +let mayRaiseFPException = 1 in def VUITOS : AVConv1InSs_Encode<0b11101, 
0b11, 0b1000, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", @@ -1566,6 +1584,7 @@ def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; +let mayRaiseFPException = 1 in def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm", @@ -1640,6 +1659,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, } // Always set Z bit in the instruction, i.e. "round towards zero" variants. +let mayRaiseFPException = 1 in def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", @@ -1660,6 +1680,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; } +let mayRaiseFPException = 1 in def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", @@ -1684,6 +1705,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; +let mayRaiseFPException = 1 in def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm", @@ -1698,6 +1720,7 @@ def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; +let mayRaiseFPException = 1 in def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", @@ -1718,6 +1741,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; } +let mayRaiseFPException = 1 in def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", @@ -1742,6 +1766,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; +let mayRaiseFPException = 1 in def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm", @@ -1757,7 +1782,7 @@ def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. 
-let Uses = [FPSCR] in { +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm", @@ -1807,7 +1832,7 @@ def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let Inst{7} = 0; // Z bit let isUnpredicable = 1; } -} +} // mayRaiseFPException = 1, Uses = [FPSCR_RM] // v8.3-a Javascript Convert to Signed fixed-point def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011, @@ -1825,7 +1850,7 @@ def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011, // S32 (U=0, sx=1) -> SL // U32 (U=1, sx=1) -> UL -let Constraints = "$a = $dst" in { +let Constraints = "$a = $dst", mayRaiseFPException = 1 in { // FP to Fixed-Point: @@ -2026,9 +2051,10 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>, Sched<[WriteFPCVT]>; -} // End of 'let Constraints = "$a = $dst" in' +} // End of 'let Constraints = "$a = $dst", mayRaiseFPException = 1 in' // BFloat16 - Single precision, unary, predicated +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in class BF16_VCVT<string opc, bits<2> op7_6> : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm), VFPUnaryFrm, NoItinerary, @@ -2063,6 +2089,7 @@ def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>; // FP Multiply-Accumulate Operations. // +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAD : ADbI<0b11100, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", @@ -2072,6 +2099,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAS : ASbIn<0b11100, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", @@ -2085,6 +2113,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAH : AHbI<0b11100, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm", @@ -2104,6 +2133,7 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", @@ -2113,6 +2143,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSS : ASbIn<0b11100, 0b00, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", @@ -2126,6 +2157,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSH : AHbI<0b11100, 0b00, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm", @@ -2144,6 +2176,7 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), (VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", @@ -2153,6 +2186,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, 
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAS : ASbI<0b11100, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", @@ -2166,6 +2200,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAH : AHbI<0b11100, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm", @@ -2196,6 +2231,7 @@ def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)), (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", @@ -2205,6 +2241,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSS : ASbI<0b11100, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", @@ -2217,6 +2254,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSH : AHbI<0b11100, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", @@ -2237,6 +2275,7 @@ def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin), //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. // +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAD : ADbI<0b11101, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm", @@ -2246,6 +2285,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAS : ASbIn<0b11101, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm", @@ -2258,6 +2298,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAH : AHbI<0b11101, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm", @@ -2289,6 +2330,7 @@ def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSD : ADbI<0b11101, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm", @@ -2298,6 +2340,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSS : ASbIn<0b11101, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm", @@ -2310,6 +2353,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, // VFP pipelines. 
} +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSH : AHbI<0b11101, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm", @@ -2341,6 +2385,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAD : ADbI<0b11101, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm", @@ -2350,6 +2395,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAS : ASbI<0b11101, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm", @@ -2362,6 +2408,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAH : AHbI<0b11101, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm", @@ -2400,6 +2447,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin))) (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSD : ADbI<0b11101, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm", @@ -2409,6 +2457,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSS : ASbI<0b11101, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", @@ -2420,6 +2469,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSH : AHbI<0b11101, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm", diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 5a31b88..de42195 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -177,8 +177,9 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; } // Current Program Status Register. -// We model fpscr with two registers: FPSCR models the control bits and will be -// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV +// We model fpscr with three registers. FPSCR models the control bits and will be +// reserved. FPSCR_RM models rounding mode control bits and will be reserved. +// FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV // models the APSR when it's accessed by some special instructions. In such cases // it has the same encoding as PC. def CPSR : ARMReg<0, "cpsr">; @@ -189,6 +190,9 @@ def FPSCR : ARMReg<3, "fpscr">; def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> { let Aliases = [FPSCR]; } +def FPSCR_RM : ARMReg<3, "fpscr_rm"> { + let Aliases = [FPSCR]; +} def ITSTATE : ARMReg<4, "itstate">; // Special Registers - only available in privileged mode. 
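Aside on the VFP hunks above: giving the arithmetic and convert instructions mayRaiseFPException = 1 plus an implicit Uses = [FPSCR_RM] (with FPSCR_RM aliasing FPSCR, per the ARMRegisterInfo.td hunk below in this file's diff) models the fact that their results depend on the dynamically selected rounding mode. A minimal, illustrative C++ sketch of the kind of strict-FP source this modeling is presumably meant to handle; the function name and rounding choices are hypothetical, and it assumes a compiler that honors FENV_ACCESS (or -frounding-math) on a double-precision VFP target:

#include <cfenv>

#pragma STDC FENV_ACCESS ON   // assumed: the compiler honors dynamic rounding

// Each a + b lowers to a VADDD, which now carries an implicit read of
// FPSCR_RM; std::fesetround() writes FPSCR (which FPSCR_RM aliases), so the
// backend sees a register dependency and should not move the additions
// across the rounding-mode changes.
double sum_both_roundings(double a, double b, double &down) {
  std::fesetround(FE_DOWNWARD);
  down = a + b;                 // rounded toward minus infinity
  std::fesetround(FE_TONEAREST);
  return a + b;                 // rounded to nearest even
}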
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp index b202b202..e3c39a1 100644 --- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp +++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsBPF.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -478,9 +479,95 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I, } } +static Value *wrapPtrIfASNotZero(DenseMap<Value *, Value *> &Cache, + CallInst *CI, Value *P) { + if (auto *PTy = dyn_cast<PointerType>(P->getType())) { + if (PTy->getAddressSpace() == 0) + return P; + } + return aspaceWrapValue(Cache, CI->getFunction(), P); +} + +static Instruction *aspaceMemSet(Intrinsic::ID ID, + DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + if (OldDst == NewDst) + return nullptr; + + // memset(new_dst, val, len, align, isvolatile, md) + Value *Val = CI->getArgOperand(1); + Value *Len = CI->getArgOperand(2); + + auto *MS = cast<MemSetInst>(CI); + MaybeAlign Align = MS->getDestAlign(); + bool IsVolatile = MS->isVolatile(); + + if (ID == Intrinsic::memset) + return B.CreateMemSet(NewDst, Val, Len, Align, IsVolatile, + MI->getAAMetadata()); + else + return B.CreateMemSetInline(NewDst, Align, Val, Len, IsVolatile, + MI->getAAMetadata()); +} + +static Instruction *aspaceMemCpy(Intrinsic::ID ID, + DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *OldSrc = CI->getArgOperand(1); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc); + if (OldDst == NewDst && OldSrc == NewSrc) + return nullptr; + + // memcpy(new_dst, dst_align, new_src, src_align, len, isvolatile, md) + Value *Len = CI->getArgOperand(2); + + auto *MT = cast<MemTransferInst>(CI); + MaybeAlign DstAlign = MT->getDestAlign(); + MaybeAlign SrcAlign = MT->getSourceAlign(); + bool IsVolatile = MT->isVolatile(); + + return B.CreateMemTransferInst(ID, NewDst, DstAlign, NewSrc, SrcAlign, Len, + IsVolatile, MI->getAAMetadata()); +} + +static Instruction *aspaceMemMove(DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *OldSrc = CI->getArgOperand(1); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc); + if (OldDst == NewDst && OldSrc == NewSrc) + return nullptr; + + // memmove(new_dst, dst_align, new_src, src_align, len, isvolatile, md) + Value *Len = CI->getArgOperand(2); + + auto *MT = cast<MemTransferInst>(CI); + MaybeAlign DstAlign = MT->getDestAlign(); + MaybeAlign SrcAlign = MT->getSourceAlign(); + bool IsVolatile = MT->isVolatile(); + + return B.CreateMemMove(NewDst, DstAlign, NewSrc, SrcAlign, Len, IsVolatile, + MI->getAAMetadata()); +} + // Support for BPF address spaces: // - for each function in the module M, update pointer operand of // each memory access instruction (load/store/cmpxchg/atomicrmw) +// or intrinsic call insns (memset/memcpy/memmove) // by casting it from non-zero address space to zero address space, e.g: // // (load (ptr 
addrspace (N) %p) ...) @@ -493,21 +580,60 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) { for (Function &F : M) { DenseMap<Value *, Value *> CastsCache; for (BasicBlock &BB : F) { - for (Instruction &I : BB) { + for (Instruction &I : llvm::make_early_inc_range(BB)) { unsigned PtrOpNum; - if (auto *LD = dyn_cast<LoadInst>(&I)) + if (auto *LD = dyn_cast<LoadInst>(&I)) { PtrOpNum = LD->getPointerOperandIndex(); - else if (auto *ST = dyn_cast<StoreInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *ST = dyn_cast<StoreInst>(&I)) { PtrOpNum = ST->getPointerOperandIndex(); - else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) { PtrOpNum = CmpXchg->getPointerOperandIndex(); - else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) { PtrOpNum = RMW->getPointerOperandIndex(); + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + + auto *CI = dyn_cast<CallInst>(&I); + if (!CI) + continue; + + Function *Callee = CI->getCalledFunction(); + if (!Callee || !Callee->isIntrinsic()) + continue; + + // Check memset/memcpy/memmove + Intrinsic::ID ID = Callee->getIntrinsicID(); + bool IsSet = ID == Intrinsic::memset || ID == Intrinsic::memset_inline; + bool IsCpy = ID == Intrinsic::memcpy || ID == Intrinsic::memcpy_inline; + bool IsMove = ID == Intrinsic::memmove; + if (!IsSet && !IsCpy && !IsMove) + continue; + + Instruction *New; + if (IsSet) + New = aspaceMemSet(ID, CastsCache, CI); + else if (IsCpy) + New = aspaceMemCpy(ID, CastsCache, CI); else + New = aspaceMemMove(CastsCache, CI); + + if (!New) continue; - aspaceWrapOperand(CastsCache, &I, PtrOpNum); + I.replaceAllUsesWith(New); + New->takeName(&I); + I.eraseFromParent(); } } Changed |= !CastsCache.empty(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 34026ed..ecfb5fe 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -439,18 +439,6 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, MF.getFunction().getContext()); - const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); - - std::optional<unsigned> FirstMaskArgument = std::nullopt; - // Preassign the first mask argument. - if (Subtarget.hasVInstructions()) { - for (const auto &ArgIdx : enumerate(Outs)) { - MVT ArgVT = MVT::getVT(ArgIdx.value().Ty); - if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1) - FirstMaskArgument = ArgIdx.index(); - } - } - for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 71c21e4..53633ea 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -675,6 +675,45 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC, CC = getRISCVCCFromICmp(Pred); } +/// Select the RISC-V Zalasr opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. 
+static unsigned selectZalasrLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + default: + llvm_unreachable("Unexpected memory size"); + case 8: + return IsStore ? RISCV::SB_RL : RISCV::LB_AQ; + case 16: + return IsStore ? RISCV::SH_RL : RISCV::LH_AQ; + case 32: + return IsStore ? RISCV::SW_RL : RISCV::LW_AQ; + case 64: + return IsStore ? RISCV::SD_RL : RISCV::LD_AQ; + } +} + +/// Select the RISC-V regimm opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. \returns \p GenericOpc if the combination is unsupported. +static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + case 8: + // Prefer unsigned due to no c.lb in Zcb. + return IsStore ? RISCV::SB : RISCV::LBU; + case 16: + return IsStore ? RISCV::SH : RISCV::LH; + case 32: + return IsStore ? RISCV::SW : RISCV::LW; + case 64: + return IsStore ? RISCV::SD : RISCV::LD; + } + + return GenericOpc; +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -736,6 +775,62 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { MI.eraseFromParent(); return true; } + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT: { + bool IsSigned = Opc != TargetOpcode::G_ZEXT; + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI->getType(SrcReg); + unsigned SrcSize = SrcTy.getSizeInBits(); + + if (SrcTy.isVector()) + return false; // Should be handled by imported patterns. + + assert((*RBI.getRegBank(DstReg, *MRI, TRI)).getID() == + RISCV::GPRBRegBankID && + "Unexpected ext regbank"); + + // Use addiw SrcReg, 0 (sext.w) for i32. + if (IsSigned && SrcSize == 32) { + MI.setDesc(TII.get(RISCV::ADDIW)); + MI.addOperand(MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use add.uw SrcReg, X0 (zext.w) for i32 with Zba. + if (!IsSigned && SrcSize == 32 && STI.hasStdExtZba()) { + MI.setDesc(TII.get(RISCV::ADD_UW)); + MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use sext.h/zext.h for i16 with Zbb. + if (SrcSize == 16 && STI.hasStdExtZbb()) { + MI.setDesc(TII.get(IsSigned ? RISCV::SEXT_H + : STI.isRV64() ? RISCV::ZEXT_H_RV64 + : RISCV::ZEXT_H_RV32)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use pack(w) SrcReg, X0 for i16 zext with Zbkb. + if (!IsSigned && SrcSize == 16 && STI.hasStdExtZbkb()) { + MI.setDesc(TII.get(STI.is64Bit() ? RISCV::PACKW : RISCV::PACK)); + MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Fall back to shift pair. + auto ShiftLeft = + MIB.buildInstr(RISCV::SLLI, {&RISCV::GPRRegClass}, {SrcReg}) + .addImm(STI.getXLen() - SrcSize); + constrainSelectedInstRegOperands(*ShiftLeft, TII, TRI, RBI); + auto ShiftRight = MIB.buildInstr(IsSigned ? RISCV::SRAI : RISCV::SRLI, + {DstReg}, {ShiftLeft}) + .addImm(STI.getXLen() - SrcSize); + constrainSelectedInstRegOperands(*ShiftRight, TII, TRI, RBI); + MI.eraseFromParent(); + return true; + } case TargetOpcode::G_FCONSTANT: { // TODO: Use constant pool for complex constants. 
Register DstReg = MI.getOperand(0).getReg(); @@ -836,6 +931,59 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return selectImplicitDef(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + GLoadStore &LdSt = cast<GLoadStore>(MI); + const Register ValReg = LdSt.getReg(0); + const Register PtrReg = LdSt.getPointerReg(); + LLT PtrTy = MRI->getType(PtrReg); + + const RegisterBank &RB = *RBI.getRegBank(ValReg, *MRI, TRI); + if (RB.getID() != RISCV::GPRBRegBankID) + return false; + +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, *MRI, TRI); + // Check that the pointer register is valid. + assert(PtrRB.getID() == RISCV::GPRBRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(PtrTy.isPointer() && "Load/Store pointer operand isn't a pointer"); +#endif + + // Can only handle AddressSpace 0. + if (PtrTy.getAddressSpace() != 0) + return false; + + unsigned MemSize = LdSt.getMemSizeInBits().getValue(); + AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); + + if (isStrongerThanMonotonic(Order)) { + MI.setDesc(TII.get(selectZalasrLoadStoreOp(Opc, MemSize))); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + const unsigned NewOpc = selectRegImmLoadStoreOp(MI.getOpcode(), MemSize); + if (NewOpc == MI.getOpcode()) + return false; + + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = selectAddrRegImm(MI.getOperand(1)); + if (!AddrModeFns) + return false; + + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, MI.getFlags()); + if (isa<GStore>(MI)) + NewInst.addUse(ValReg); + else + NewInst.addDef(ValReg); + NewInst.cloneMemRefs(MI); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); + } default: return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index a537904..5dd4bf4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -166,7 +166,7 @@ static unsigned getLRForRMW32(AtomicOrdering Ordering, return RISCV::LR_W; return RISCV::LR_W_AQ; case AtomicOrdering::SequentiallyConsistent: - return RISCV::LR_W_AQ_RL; + return RISCV::LR_W_AQRL; } } @@ -210,7 +210,7 @@ static unsigned getLRForRMW64(AtomicOrdering Ordering, return RISCV::LR_D; return RISCV::LR_D_AQ; case AtomicOrdering::SequentiallyConsistent: - return RISCV::LR_D_AQ_RL; + return RISCV::LR_D_AQRL; } } @@ -287,8 +287,8 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -375,8 +375,8 @@ static void doMaskedAtomicBinOpExpansion(const RISCVInstrInfo *TII, ScratchReg); BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -535,8 +535,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( // sc.w scratch1, scratch1, (addr) // bnez scratch1, loop BuildMI(LoopTailMBB, DL, 
TII->get(getSCForRMW32(Ordering, STI)), Scratch1Reg) - .addReg(AddrReg) - .addReg(Scratch1Reg); + .addReg(Scratch1Reg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(Scratch1Reg) .addReg(RISCV::X0) @@ -674,8 +674,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( // bnez scratch, loophead BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(NewValReg); + .addReg(NewValReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -707,8 +707,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( MaskReg, ScratchReg); BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a02de31..40c05e8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -265,7 +265,7 @@ def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">; def FeatureStdExtZalasr - : RISCVExperimentalExtension<0, 1, "Load-Acquire and Store-Release Instructions">; + : RISCVExperimentalExtension<0, 9, "Load-Acquire and Store-Release Instructions">; def HasStdExtZalasr : Predicate<"Subtarget->hasStdExtZalasr()">, AssemblerPredicate<(all_of FeatureStdExtZalasr), "'Zalasr' (Load-Acquire and Store-Release Instructions)">; @@ -1421,7 +1421,7 @@ def HasVendorXMIPSCMov : Predicate<"Subtarget->hasVendorXMIPSCMov()">, AssemblerPredicate<(all_of FeatureVendorXMIPSCMov), "'Xmipscmov' ('mips.ccmov' instruction)">; -def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">; +def UseMIPSCCMovInsn : Predicate<"Subtarget->useMIPSCCMovInsn()">; def FeatureVendorXMIPSLSP : RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">; diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 7f5d0af..eba35ef 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -100,93 +100,11 @@ def : LdPat<load, LD, PtrVT>; def : StPat<store, SD, GPR, PtrVT>; } -// Load and store patterns for i16, needed because Zfh makes s16 load/store -// legal and regbank select may not constrain registers to FP. -def : LdPat<load, LH, i16>; -def : StPat<store, SH, GPR, i16>; - -def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. -def : StPat<truncstorei8, SB, GPR, i16>; - -let Predicates = [HasAtomicLdSt] in { - // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; - def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - - def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; - def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; -} - -let Predicates = [HasAtomicLdSt, IsRV64] in { - // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; -} - //===----------------------------------------------------------------------===// // RV64 i32 patterns not used by SelectionDAG //===----------------------------------------------------------------------===// let Predicates = [IsRV64] in { -def : LdPat<extloadi8, LBU, i32>; // Prefer unsigned due to no c.lb in Zcb. 
-def : LdPat<extloadi16, LH, i32>; - -def : StPat<truncstorei8, SB, GPR, i32>; -def : StPat<truncstorei16, SH, GPR, i32>; - -def : Pat<(sext (i32 GPR:$src)), (ADDIW GPR:$src, 0)>; - def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32), (ADDIW GPR:$rs1, simm12_lo:$imm)>; } - -let Predicates = [IsRV64, NoStdExtZba] in -def : Pat<(zext (i32 GPR:$src)), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>; - -let Predicates = [IsRV32, NoStdExtZbb, NoStdExtZbkb] in -def : Pat<(XLenVT (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 16)), 16)>; - -let Predicates = [IsRV64, NoStdExtZbb, NoStdExtZbkb] in { -def : Pat<(i64 (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>; -def : Pat<(i32 (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>; -} - -let Predicates = [IsRV32, NoStdExtZbb] in -def : Pat<(XLenVT (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 16)), 16)>; - -let Predicates = [IsRV64, NoStdExtZbb] in { -def : Pat<(i64 (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>; -def : Pat<(i32 (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>; -} - -//===----------------------------------------------------------------------===// -// Zb* RV64 patterns not used by SelectionDAG. -//===----------------------------------------------------------------------===// - -let Predicates = [HasStdExtZba, IsRV64] in { -def : Pat<(zext (i32 GPR:$src)), (ADD_UW GPR:$src, (XLenVT X0))>; -} - -let Predicates = [HasStdExtZbb] in -def : Pat<(i32 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>; -let Predicates = [HasStdExtZbb, IsRV64] in -def : Pat<(i64 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>; - -let Predicates = [HasStdExtZbb, IsRV32] in -def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV32 GPR:$rs)>; -let Predicates = [HasStdExtZbb, IsRV64] in { -def : Pat<(i64 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>; -def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>; -} - -let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV32] in -def : Pat<(i32 (zext (i16 GPR:$rs))), (PACK GPR:$rs, (XLenVT X0))>; -let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV64] in { -def : Pat<(i64 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; -def : Pat<(i32 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; -} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dcce2d2..7123a2d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -434,7 +434,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ABS, MVT::i32, Custom); } - if (!Subtarget.useCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov()) + if (!Subtarget.useMIPSCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov()) setOperationAction(ISD::SELECT, XLenVT, Custom); if (Subtarget.hasVendorXqcia() && !Subtarget.is64Bit()) { @@ -15721,8 +15721,7 @@ static SDValue combineAddOfBooleanXor(SDNode *N, SelectionDAG &DAG) { return SDValue(); // Emit a negate of the setcc. 
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - N0.getOperand(0)); + return DAG.getNegative(N0.getOperand(0), DL, VT); } static SDValue performADDCombine(SDNode *N, @@ -16498,43 +16497,60 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue X = N->getOperand(0); if (Subtarget.hasShlAdd(3)) { - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 2^N -> shl (shXadd X, X), N - if (isPowerOf2_64(MulAmt2)) { - SDLoc DL(N); - SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the - // shift forming a slli.uw. - if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && - X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), - Shl); - } - // Otherwise, put rhe shl second so that it can fold with following - // instructions (e.g. sext or add). - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(ISD::SHL, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); - } - - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) { - SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT), - Mul359); + int Shift; + if (int ShXAmount = isShifted359(MulAmt, Shift)) { + // 3/5/9 * 2^N -> shl (shXadd X, X), N + SDLoc DL(N); + SDValue X = N->getOperand(0); + // Put the shift first if we can fold a zext into the shift forming + // a slli.uw. + if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && + X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getConstant(ShXAmount, DL, VT), Shl); } + // Otherwise, put the shl second so that it can fold with following + // instructions (e.g. sext or add). + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShXAmount, DL, VT), X); + return DAG.getNode(ISD::SHL, DL, VT, Mul359, + DAG.getConstant(Shift, DL, VT)); + } + + // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) + int ShX; + int ShY; + switch (MulAmt) { + case 3 * 5: + ShY = 1; + ShX = 2; + break; + case 3 * 9: + ShY = 1; + ShX = 3; + break; + case 5 * 5: + ShX = ShY = 2; + break; + case 5 * 9: + ShY = 2; + ShX = 3; + break; + case 9 * 9: + ShX = ShY = 3; + break; + default: + ShX = ShY = 0; + break; + } + if (ShX) { + SDLoc DL(N); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(ShX, DL, VT), Mul359); } // If this is a power 2 + 2/4/8, we can use a shift followed by a single @@ -16557,18 +16573,14 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // variants we could implement. e.g. 
// (2^(1,2,3) * 3,5,9 + 1) << C2 // 2^(C1>3) * 3,5,9 +/- 1 - for (uint64_t Divisor : {3, 5, 9}) { - uint64_t C = MulAmt - 1; - if (C <= Divisor) - continue; - unsigned TZ = llvm::countr_zero(C); - if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) { + if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { + assert(Shift != 0 && "MulAmt=4,6,10 handled before"); + if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(TZ, DL, VT), X); + DAG.getConstant(Shift, DL, VT), X); } } @@ -16576,7 +16588,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); + unsigned ShiftAmt = llvm::countr_zero((MulAmt - 1) & (MulAmt - 2)); SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); @@ -16589,7 +16601,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x)) for (uint64_t Offset : {3, 5, 9}) { if (isPowerOf2_64(MulAmt + Offset)) { - unsigned ShAmt = Log2_64(MulAmt + Offset); + unsigned ShAmt = llvm::countr_zero(MulAmt + Offset); if (ShAmt >= VT.getSizeInBits()) continue; SDLoc DL(N); @@ -16608,21 +16620,16 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt2 = MulAmt / Divisor; // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples // of 25 which happen to be quite common. - for (uint64_t Divisor2 : {3, 5, 9}) { - if (MulAmt2 % Divisor2 != 0) - continue; - uint64_t MulAmt3 = MulAmt2 / Divisor2; - if (isPowerOf2_64(MulAmt3)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = DAG.getNode( - RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(Log2_64(Divisor2 - 1), DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Log2_64(MulAmt3), DL, VT)); - } + if (int ShBAmount = isShifted359(MulAmt2, Shift)) { + SDLoc DL(N); + SDValue Mul359A = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + SDValue Mul359B = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, + DAG.getConstant(ShBAmount, DL, VT), Mul359A); + return DAG.getNode(ISD::SHL, DL, VT, Mul359B, + DAG.getConstant(Shift, DL, VT)); } } } @@ -16966,7 +16973,7 @@ performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc)) if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG()) - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + return DAG.getNegative(Src, DL, VT); // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1) if (Opc == ISD::XOR && SrcVT == MVT::i1 && @@ -25031,8 +25038,17 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { // Mark RVV intrinsic as supported. - if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) + if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) { + // GISel doesn't support tuple types yet. 
+ if (Inst.getType()->isRISCVVectorTupleTy()) + return true; + + for (unsigned i = 0; i < II->arg_size(); ++i) + if (II->getArgOperand(i)->getType()->isRISCVVectorTupleTy()) + return true; + return false; + } } if (Inst.getType()->isScalableTy()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7db4832..96e1078 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4586,24 +4586,23 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(DestReg, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); - } else if (STI.hasShlAdd(3) && - ((Amount % 3 == 0 && isPowerOf2_64(Amount / 3)) || - (Amount % 5 == 0 && isPowerOf2_64(Amount / 5)) || - (Amount % 9 == 0 && isPowerOf2_64(Amount / 9)))) { + } else if (int ShXAmount, ShiftAmount; + STI.hasShlAdd(3) && + (ShXAmount = isShifted359(Amount, ShiftAmount)) != 0) { // We can use Zba SHXADD+SLLI instructions for multiply in some cases. unsigned Opc; - uint32_t ShiftAmount; - if (Amount % 9 == 0) { - Opc = RISCV::SH3ADD; - ShiftAmount = Log2_64(Amount / 9); - } else if (Amount % 5 == 0) { - Opc = RISCV::SH2ADD; - ShiftAmount = Log2_64(Amount / 5); - } else if (Amount % 3 == 0) { + switch (ShXAmount) { + case 1: Opc = RISCV::SH1ADD; - ShiftAmount = Log2_64(Amount / 3); - } else { - llvm_unreachable("implied by if-clause"); + break; + case 2: + Opc = RISCV::SH2ADD; + break; + case 3: + Opc = RISCV::SH3ADD; + break; + default: + llvm_unreachable("unexpected result of isShifted359"); } if (ShiftAmount) BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 42a0c4c..c5eddb9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -25,6 +25,25 @@ namespace llvm { +// If Value is of the form C1<<C2, where C1 = 3, 5 or 9, +// returns log2(C1 - 1) and assigns Shift = C2. +// Otherwise, returns 0. +template <typename T> int isShifted359(T Value, int &Shift) { + if (Value == 0) + return 0; + Shift = llvm::countr_zero(Value); + switch (Value >> Shift) { + case 3: + return 1; + case 5: + return 2; + case 9: + return 3; + default: + return 0; + } +} + class RISCVSubtarget; static const MachineMemOperand::Flags MONontemporalBit0 = diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9855c47..7a14929 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1980,7 +1980,7 @@ def : LdPat<sextloadi8, LB>; def : LdPat<extloadi8, LBU>; // Prefer unsigned due to no c.lb in Zcb. 
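As an aside, the isShifted359 helper introduced above encodes the contract "if Value == C1 << C2 with C1 in {3, 5, 9}, return log2(C1 - 1) and set Shift = C2, otherwise return 0". A standalone check of that contract, restated with std::countr_zero from C++20 <bit> in place of llvm::countr_zero (purely illustrative, not the header's template itself):

#include <bit>
#include <cassert>
#include <cstdint>

static int isShifted359(uint64_t Value, int &Shift) {
  if (Value == 0)
    return 0;
  Shift = std::countr_zero(Value);
  switch (Value >> Shift) {
  case 3: return 1;   // shXadd amount for x*3 = (x << 1) + x
  case 5: return 2;   // x*5 = (x << 2) + x
  case 9: return 3;   // x*9 = (x << 3) + x
  default: return 0;
  }
}

int main() {
  int Shift = 0;
  assert(isShifted359(40, Shift) == 2 && Shift == 3);  // 40 = 5 << 3 -> SH2ADD then SLLI 3
  assert(isShifted359(12, Shift) == 1 && Shift == 2);  // 12 = 3 << 2 -> SH1ADD then SLLI 2
  assert(isShifted359(45, Shift) == 0);                // 45 = 9*5 is not 3/5/9 * 2^N
}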
def : LdPat<sextloadi16, LH>; def : LdPat<extloadi16, LH>; -def : LdPat<load, LW, i32>; +def : LdPat<load, LW, i32>, Requires<[IsRV32]>; def : LdPat<zextloadi8, LBU>; def : LdPat<zextloadi16, LHU>; @@ -1994,7 +1994,7 @@ class StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy, def : StPat<truncstorei8, SB, GPR, XLenVT>; def : StPat<truncstorei16, SH, GPR, XLenVT>; -def : StPat<store, SW, GPR, i32>; +def : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>; /// Fences diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 25accd9..571d72f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -24,36 +24,36 @@ class LR_r<bit aq, bit rl, bits<3> funct3, string opcodestr> } multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> { - def "" : LR_r<0, 0, funct3, opcodestr>; - def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">; - def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : LR_r<1, 1, funct3, opcodestr # ".aqrl">; + def "" : LR_r<0, 0, funct3, opcodestr>; + def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">; + def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : LR_r<1, 1, funct3, opcodestr # ".aqrl">; } let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class SC_r<bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<0b00011, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { - def "" : SC_r<0, 0, funct3, opcodestr>; - def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">; - def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : SC_r<1, 1, funct3, opcodestr # ".aqrl">; + def "" : SC_r<0, 0, funct3, opcodestr>; + def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">; + def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : SC_r<1, 1, funct3, opcodestr # ".aqrl">; } let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> { - def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>; - def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">; - def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">; + def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>; + def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">; + def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">; + def _AQRL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">; } //===----------------------------------------------------------------------===// @@ -174,8 +174,9 @@ let Predicates = [HasAtomicLdSt] in { def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasAtomicLdSt, IsRV32] in { def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>; } @@ -188,31 +189,34 @@ let Predicates = [HasAtomicLdSt, IsRV64] in { /// AMOs +class PatAMO<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT> + : Pat<(vt (OpNode (XLenVT GPR:$rs1), (vt GPR:$rs2))), (Inst GPR:$rs2, GPR:$rs1)>; + multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtA, NoStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst#"_AQ"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst#"_RL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst#"_AQ_RL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst#"_AQ_RL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst#"_AQ"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst#"_RL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; } let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst), vt>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 115ab38e..0b5bee1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -175,7 +175,7 @@ def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), Sched<[]>; } -let Predicates = [UseCCMovInsn] in { +let Predicates = [UseMIPSCCMovInsn] in { def : Pat<(select (riscv_setne (XLenVT GPR:$rs2)), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), (MIPS_CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td index 7cf6d5f..20e2142 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -9,8 +9,8 @@ // This file describes the RISC-V instructions from the standard atomic 'Za*' // extensions: // - Zawrs (v1.0) : Wait-on-Reservation-Set. -// - Zacas (v1.0-rc1) : Atomic Compare-and-Swap. -// - Zabha (v1.0-rc1) : Byte and Halfword Atomic Memory Operations. +// - Zacas (v1.0) : Atomic Compare-and-Swap. +// - Zabha (v1.0) : Byte and Halfword Atomic Memory Operations. 
// //===----------------------------------------------------------------------===// @@ -44,15 +44,15 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "$rd = $rd_wb" class AMO_cas<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr, DAGOperand RC> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs RC:$rd_wb), (ins RC:$rd, GPRMemZeroOffset:$rs1, RC:$rs2), + (outs RC:$rd_wb), (ins RC:$rd, RC:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_cas_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr, DAGOperand RC> { - def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>; - def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>; - def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>; - def _AQ_RL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>; + def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>; + def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>; + def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>; + def _AQRL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>; } let Predicates = [HasStdExtZacas], IsSignExtendingOpW = 1 in { @@ -71,48 +71,48 @@ defm AMOCAS_Q : AMO_cas_aq_rl<0b00101, 0b100, "amocas.q", GPRPairRV64>; multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) let Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : 
Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) } @@ -140,7 +140,7 @@ def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>; // Zabha (Byte and Halfword Atomic Memory Operations) //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZabha] in { +let Predicates = [HasStdExtZabha], IsSignExtendingOpW = 1 in { defm AMOSWAP_B : AMO_rr_aq_rl<0b00001, 0b000, "amoswap.b">, Sched<[WriteAtomicB, ReadAtomicBA, ReadAtomicBD]>; defm AMOADD_B : AMO_rr_aq_rl<0b00000, 0b000, "amoadd.b">, @@ -181,7 +181,7 @@ defm AMOMAXU_H : AMO_rr_aq_rl<0b11100, 0b001, "amomaxu.h">, } // If Zacas extension is also implemented, Zabha further provides AMOCAS.[B|H]. -let Predicates = [HasStdExtZabha, HasStdExtZacas] in { +let Predicates = [HasStdExtZabha, HasStdExtZacas], IsSignExtendingOpW = 1 in { defm AMOCAS_B : AMO_cas_aq_rl<0b00101, 0b000, "amocas.b", GPR>; defm AMOCAS_H : AMO_cas_aq_rl<0b00101, 0b001, "amocas.h", GPR>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td index 1dd7332..5f944034 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td @@ -30,21 +30,22 @@ class SRL_r<bit aq, bit rl, bits<3> funct3, string opcodestr> opcodestr, "$rs2, $rs1"> { let rd = 0; } + multiclass LAQ_r_aq_rl<bits<3> funct3, string opcodestr> { - def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">; - def _AQ_RL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">; + def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">; + def _AQRL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">; } multiclass SRL_r_aq_rl<bits<3> funct3, string opcodestr> { - def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">; + def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">; } //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZalasr] in { +let Predicates = [HasStdExtZalasr], IsSignExtendingOpW = 1 in { defm LB : LAQ_r_aq_rl<0b000, "lb">; defm LH : LAQ_r_aq_rl<0b001, "lh">; defm LW : LAQ_r_aq_rl<0b010, "lw">; @@ -93,11 +94,11 @@ let Predicates = [HasStdExtZalasr] in { def : PatSRL<releasing_store<atomic_store_32>, SW_RL>; def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>; -} // Predicates = [HasStdExtZalasr] +} let Predicates = [HasStdExtZalasr, IsRV32] in { - def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ>; - def : 
PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ>; + def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>; + def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>; } // Predicates = [HasStdExtZalasr, IsRV32] let Predicates = [HasStdExtZalasr, IsRV64] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index ce21d83..8d9b777 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -808,9 +808,9 @@ multiclass Sh2Add_UWPat<Instruction sh2add_uw> { } multiclass Sh3Add_UWPat<Instruction sh3add_uw> { - def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0xFFFFFFF8), + def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), (XLenVT GPR:$rs2))), - (sh3add_uw (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>; + (sh3add_uw GPR:$rs1, GPR:$rs2)>; // Use SRLI to clear the LSBs and SHXADD_UW to mask and shift. def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))), diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp index c81a20b..115a96e 100644 --- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp @@ -92,7 +92,7 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; const RISCVSubtarget &Subtarget = Fn.getSubtarget<RISCVSubtarget>(); - if (!Subtarget.useLoadStorePairs()) + if (!Subtarget.useMIPSLoadStorePairs()) return false; bool MadeChange = false; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3f2e7db..3e07eff 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -567,9 +567,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX8", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX8", [VCQ, VL], mx, IsWorstCase>; @@ -587,9 +590,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX16", [VCQ, VL], mx, IsWorstCase>; defm : 
LMULWriteResMX<"WriteVLDOX16", [VCQ, VL], mx, IsWorstCase>; @@ -604,9 +610,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX32", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX32", [VCQ, VL], mx, IsWorstCase>; @@ -621,9 +630,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX64", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX64", [VCQ, VL], mx, IsWorstCase>; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 6c7658c..01a4308 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -67,42 +67,41 @@ multiclass LMULSEWWriteResMXSEW<string name, list<ProcResourceKind> resources, // ReleaseAtCycles predCycles if the SchedPredicate Pred is true, otherwise has // Latency noPredLat and ReleaseAtCycles noPredCycles. The WorstCase SchedWrite // is created similarly if IsWorstCase is true. 
-multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, - list<ProcResourceKind> resources, - int predLat, list<int> predAcquireCycles, - list<int> predReleaseCycles, int noPredLat, - list<int> noPredAcquireCycles, - list<int> noPredReleaseCycles, - string mx, bit IsWorstCase> { - defvar nameMX = name # "_" # mx; - +multiclass LMULWriteResVariantImpl<string name, string writeResName, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + bit IsWorstCase> { // Define the different behaviors - def nameMX # "_Pred" : SchedWriteRes<resources>{ + def writeResName # "_Pred" : SchedWriteRes<predResources>{ let Latency = predLat; let AcquireAtCycles = predAcquireCycles; let ReleaseAtCycles = predReleaseCycles; } - def nameMX # "_NoPred" : SchedWriteRes<resources> { + def writeResName # "_NoPred" : SchedWriteRes<noPredResources> { let Latency = noPredLat; let AcquireAtCycles = noPredAcquireCycles; let ReleaseAtCycles = noPredReleaseCycles; } // Define SchedVars - def nameMX # PredSchedVar - : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>; - def nameMX # NoPredSchedVar - : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>; + def writeResName # PredSchedVar + : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # writeResName # "_Pred")]>; + def writeResName # NoPredSchedVar + : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # writeResName #"_NoPred")]>; // Allow multiclass to refer to SchedVars -- need to have NAME prefix. - defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar); - defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar); + defvar PredSchedVar = !cast<SchedVar>(NAME # writeResName # PredSchedVar); + defvar NoPredSchedVar = !cast<SchedVar>(NAME # writeResName # NoPredSchedVar); // Tie behavior to predicate - def NAME # nameMX # "_Variant" + def NAME # writeResName # "_Variant" : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< - !cast<SchedReadWrite>(nameMX), - !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>; + !cast<SchedReadWrite>(writeResName), + !cast<SchedReadWrite>(NAME # writeResName # "_Variant")>; if IsWorstCase then { def NAME # name # "_WorstCase_Variant" @@ -113,6 +112,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, } } +multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. 
Each multiclass is responsible for defining diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index e35ffaf..715ac4c 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -65,9 +65,9 @@ static cl::opt<bool> UseMIPSLoadStorePairsOpt( cl::desc("Enable the load/store pair optimization pass"), cl::init(false), cl::Hidden); -static cl::opt<bool> UseCCMovInsn("use-riscv-ccmov", - cl::desc("Use 'mips.ccmov' instruction"), - cl::init(true), cl::Hidden); +static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", + cl::desc("Use 'mips.ccmov' instruction"), + cl::init(true), cl::Hidden); void RISCVSubtarget::anchor() {} @@ -246,10 +246,10 @@ void RISCVSubtarget::overridePostRASchedPolicy( } } -bool RISCVSubtarget::useLoadStorePairs() const { +bool RISCVSubtarget::useMIPSLoadStorePairs() const { return UseMIPSLoadStorePairsOpt && HasVendorXMIPSLSP; } -bool RISCVSubtarget::useCCMovInsn() const { - return UseCCMovInsn && HasVendorXMIPSCMov; +bool RISCVSubtarget::useMIPSCCMovInsn() const { + return UseMIPSCCMovInsn && HasVendorXMIPSCMov; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 7dffa63..6acf799 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -227,8 +227,8 @@ public: unsigned getXLen() const { return is64Bit() ? 64 : 32; } - bool useLoadStorePairs() const; - bool useCCMovInsn() const; + bool useMIPSLoadStorePairs() const; + bool useMIPSCCMovInsn() const; unsigned getFLen() const { if (HasStdExtD) return 64; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index ee25f69..7bc0b5b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2747,20 +2747,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, Intrinsic::ID IID = Inst->getIntrinsicID(); LLVMContext &C = Inst->getContext(); bool HasMask = false; + + auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo, + bool IsWrite) -> int64_t { + if (auto *TarExtTy = + dyn_cast<TargetExtType>(II->getArgOperand(0)->getType())) + return TarExtTy->getIntParameter(0); + + return 1; + }; + switch (IID) { case Intrinsic::riscv_vle_mask: case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vle: - case Intrinsic::riscv_vse: { + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vlseg2: + case Intrinsic::riscv_vlseg3: + case Intrinsic::riscv_vlseg4: + case Intrinsic::riscv_vlseg5: + case Intrinsic::riscv_vlseg6: + case Intrinsic::riscv_vlseg7: + case Intrinsic::riscv_vlseg8: + case Intrinsic::riscv_vsseg2: + case Intrinsic::riscv_vsseg3: + case Intrinsic::riscv_vsseg4: + case Intrinsic::riscv_vsseg5: + case Intrinsic::riscv_vsseg6: + case Intrinsic::riscv_vsseg7: + case Intrinsic::riscv_vsseg8: { // Intrinsic interface: // riscv_vle(merge, ptr, vl) // 
riscv_vle_mask(merge, ptr, mask, vl, policy) // riscv_vse(val, ptr, vl) // riscv_vse_mask(val, ptr, mask, vl, policy) + // riscv_vlseg#(merge, ptr, vl, sew) + // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew) + // riscv_vsseg#(val, ptr, vl, sew) + // riscv_vsseg#_mask(val, ptr, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. + if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 1 - HasMask; @@ -2771,23 +2823,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, if (HasMask) Mask = Inst->getArgOperand(VLIndex - 1); Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Alignment, Mask, EVL); return true; } case Intrinsic::riscv_vlse_mask: case Intrinsic::riscv_vsse_mask: + case Intrinsic::riscv_vlsseg2_mask: + case Intrinsic::riscv_vlsseg3_mask: + case Intrinsic::riscv_vlsseg4_mask: + case Intrinsic::riscv_vlsseg5_mask: + case Intrinsic::riscv_vlsseg6_mask: + case Intrinsic::riscv_vlsseg7_mask: + case Intrinsic::riscv_vlsseg8_mask: + case Intrinsic::riscv_vssseg2_mask: + case Intrinsic::riscv_vssseg3_mask: + case Intrinsic::riscv_vssseg4_mask: + case Intrinsic::riscv_vssseg5_mask: + case Intrinsic::riscv_vssseg6_mask: + case Intrinsic::riscv_vssseg7_mask: + case Intrinsic::riscv_vssseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vlse: - case Intrinsic::riscv_vsse: { + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vlsseg2: + case Intrinsic::riscv_vlsseg3: + case Intrinsic::riscv_vlsseg4: + case Intrinsic::riscv_vlsseg5: + case Intrinsic::riscv_vlsseg6: + case Intrinsic::riscv_vlsseg7: + case Intrinsic::riscv_vlsseg8: + case Intrinsic::riscv_vssseg2: + case Intrinsic::riscv_vssseg3: + case Intrinsic::riscv_vssseg4: + case Intrinsic::riscv_vssseg5: + case Intrinsic::riscv_vssseg6: + case Intrinsic::riscv_vssseg7: + case Intrinsic::riscv_vssseg8: { // Intrinsic interface: // riscv_vlse(merge, ptr, stride, vl) // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy) // riscv_vsse(val, ptr, stride, vl) // riscv_vsse_mask(val, ptr, stride, mask, vl, policy) + // riscv_vlsseg#(merge, ptr, offset, vl, sew) + // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew) + // riscv_vssseg#(val, ptr, offset, vl, sew) + // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. 
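As an aside, a worked instance of the SEW/type reconstruction performed above (and repeated for the strided and indexed segment forms below), under the assumption that the intrinsic's tuple operand has type target("riscv.vector.tuple", <vscale x 16 x i8>, 4) and its trailing sew operand is 5; both values are chosen only for illustration:

  SEW           = 1 << 5            = 32 bits
  element count = 16 * 8 / SEW      = 4        -> access type <vscale x 4 x i32>
  SegNum (NF)   = 4, so the segment-packing step widens each element to
                  i(32 * 4) = i128             -> contiguous access type <vscale x 4 x i128>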
+ if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 2 - HasMask; @@ -2809,6 +2910,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, if (HasMask) Mask = Inst->getArgOperand(VLIndex - 1); Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Alignment, Mask, EVL, Stride); return true; @@ -2817,19 +2925,89 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::riscv_vluxei_mask: case Intrinsic::riscv_vsoxei_mask: case Intrinsic::riscv_vsuxei_mask: + case Intrinsic::riscv_vloxseg2_mask: + case Intrinsic::riscv_vloxseg3_mask: + case Intrinsic::riscv_vloxseg4_mask: + case Intrinsic::riscv_vloxseg5_mask: + case Intrinsic::riscv_vloxseg6_mask: + case Intrinsic::riscv_vloxseg7_mask: + case Intrinsic::riscv_vloxseg8_mask: + case Intrinsic::riscv_vluxseg2_mask: + case Intrinsic::riscv_vluxseg3_mask: + case Intrinsic::riscv_vluxseg4_mask: + case Intrinsic::riscv_vluxseg5_mask: + case Intrinsic::riscv_vluxseg6_mask: + case Intrinsic::riscv_vluxseg7_mask: + case Intrinsic::riscv_vluxseg8_mask: + case Intrinsic::riscv_vsoxseg2_mask: + case Intrinsic::riscv_vsoxseg3_mask: + case Intrinsic::riscv_vsoxseg4_mask: + case Intrinsic::riscv_vsoxseg5_mask: + case Intrinsic::riscv_vsoxseg6_mask: + case Intrinsic::riscv_vsoxseg7_mask: + case Intrinsic::riscv_vsoxseg8_mask: + case Intrinsic::riscv_vsuxseg2_mask: + case Intrinsic::riscv_vsuxseg3_mask: + case Intrinsic::riscv_vsuxseg4_mask: + case Intrinsic::riscv_vsuxseg5_mask: + case Intrinsic::riscv_vsuxseg6_mask: + case Intrinsic::riscv_vsuxseg7_mask: + case Intrinsic::riscv_vsuxseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vloxei: case Intrinsic::riscv_vluxei: case Intrinsic::riscv_vsoxei: - case Intrinsic::riscv_vsuxei: { + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vloxseg2: + case Intrinsic::riscv_vloxseg3: + case Intrinsic::riscv_vloxseg4: + case Intrinsic::riscv_vloxseg5: + case Intrinsic::riscv_vloxseg6: + case Intrinsic::riscv_vloxseg7: + case Intrinsic::riscv_vloxseg8: + case Intrinsic::riscv_vluxseg2: + case Intrinsic::riscv_vluxseg3: + case Intrinsic::riscv_vluxseg4: + case Intrinsic::riscv_vluxseg5: + case Intrinsic::riscv_vluxseg6: + case Intrinsic::riscv_vluxseg7: + case Intrinsic::riscv_vluxseg8: + case Intrinsic::riscv_vsoxseg2: + case Intrinsic::riscv_vsoxseg3: + case Intrinsic::riscv_vsoxseg4: + case Intrinsic::riscv_vsoxseg5: + case Intrinsic::riscv_vsoxseg6: + case Intrinsic::riscv_vsoxseg7: + case Intrinsic::riscv_vsoxseg8: + case Intrinsic::riscv_vsuxseg2: + case Intrinsic::riscv_vsuxseg3: + case Intrinsic::riscv_vsuxseg4: + case Intrinsic::riscv_vsuxseg5: + case Intrinsic::riscv_vsuxseg6: + case Intrinsic::riscv_vsuxseg7: + case Intrinsic::riscv_vsuxseg8: { // Intrinsic interface (only listed ordered version): // 
riscv_vloxei(merge, ptr, index, vl) // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy) // riscv_vsoxei(val, ptr, index, vl) // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy) + // riscv_vloxseg#(merge, ptr, index, vl, sew) + // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew) + // riscv_vsoxseg#(val, ptr, index, vl, sew) + // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. + if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 2 - HasMask; @@ -2845,6 +3023,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, Mask = ConstantInt::getTrue(MaskType); } Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1); Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Align(1), Mask, EVL, diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 8e7e2e5..f1d487c87 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -326,6 +326,15 @@ void SparcAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) { Sparc_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + emitInstruction(&*I); + ++I; + } + return; + } switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index a160709..cbb7db6 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -3557,3 +3558,28 @@ void SparcTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (!Node->hasAnyUseOfValue(0)) MI.getOperand(0).setReg(SP::G0); } + +Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + bool HasStoreSemantics = + isa<AtomicCmpXchgInst, AtomicRMWInst, StoreInst>(Inst); + if (HasStoreSemantics && isReleaseOrStronger(Ord)) + return Builder.CreateFence(AtomicOrdering::Release); + return nullptr; +} + +Instruction *SparcTargetLowering::emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering 
Ord) const { + // V8 loads already come with implicit acquire barrier so there's no need to + // emit it again. + bool HasLoadSemantics = isa<AtomicCmpXchgInst, AtomicRMWInst, LoadInst>(Inst); + if (Subtarget->isV9() && HasLoadSemantics && isAcquireOrStronger(Ord)) + return Builder.CreateFence(AtomicOrdering::Acquire); + + // SC plain stores would need a trailing full barrier. + if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) + return Builder.CreateFence(Ord); + return nullptr; +} diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index e7040f7..f3efd94 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -183,6 +183,11 @@ namespace llvm { bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { // FIXME: We insert fences for each atomics and generate // sub-optimal code for PSO/TSO. (Approximately nobody uses any diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index e28f445..f66eb9d 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -653,6 +653,23 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addImm(Offset); return true; } + case SP::V8BAR: { + assert(!Subtarget.isV9() && + "V8BAR should not be emitted on V9 processors!"); + + // Emit stbar; ldstub [%sp-1], %g0 + // The sequence acts as a full barrier on V8 systems. + MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr &InstSTBAR = + *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::STBAR)); + MachineInstr &InstLDSTUB = + *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::LDSTUBri), SP::G0) + .addReg(SP::O6) + .addImm(-1); + MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB); + MBB.erase(MI); + return true; + } } return false; } diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 97e7fd7..bc192c2 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -578,6 +578,9 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> let isPseudo = 1; } +// Full memory barrier for V8. +def V8BAR : Pseudo<(outs), (ins), "!V8BAR", []>, Requires<[HasNoV9]>; + // GETPCX for PIC let Defs = [O7] in { def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >; @@ -1974,12 +1977,30 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>; def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>; def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>; -// store bar for all atomic_fence in V8. -let Predicates = [HasNoV9] in - def : Pat<(atomic_fence timm, timm), (STBAR)>; +// All load-type operations in V8 comes with implicit acquire semantics. +let Predicates = [HasNoV9] in { + // Acquire -> nop + def : Pat<(atomic_fence (i32 4), timm), (NOP)>; + // Release / AcqRel -> stbar + def : Pat<(atomic_fence (i32 5), timm), (STBAR)>; + // AcqRel and stronger -> stbar; ldstub [%sp-1], %g0 + def : Pat<(atomic_fence timm, timm), (V8BAR)>; +} -let Predicates = [HasV9] in +// We have to handle both 32 and 64-bit cases. 
+let Predicates = [HasV9] in { + // Acquire -> membar #LoadLoad | #LoadStore + def : Pat<(atomic_fence (i32 4), timm), (MEMBARi 0x5)>; + def : Pat<(atomic_fence (i64 4), timm), (MEMBARi 0x5)>; + // Release -> membar #LoadStore | #StoreStore + def : Pat<(atomic_fence (i32 5), timm), (MEMBARi 0xc)>; + def : Pat<(atomic_fence (i64 5), timm), (MEMBARi 0xc)>; + // AcqRel -> membar #LoadLoad | #LoadStore | #StoreStore + def : Pat<(atomic_fence (i32 6), timm), (MEMBARi 0xd)>; + def : Pat<(atomic_fence (i64 6), timm), (MEMBARi 0xd)>; + // SeqCst -> membar #StoreLoad | #LoadLoad | #LoadStore | #StoreStore def : Pat<(atomic_fence timm, timm), (MEMBARi 0xf)>; +} // atomic_load addr -> load addr def : Pat<(i32 (atomic_load_azext_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 163bf9b..6472334 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3209,7 +3209,8 @@ static SDValue performAnyAllCombine(SDNode *N, SelectionDAG &DAG) { using namespace llvm::SDPatternMatch; SDValue LHS; - if (!sd_match(N->getOperand(1), + if (N->getNumOperands() < 2 || + !sd_match(N->getOperand(1), m_c_SetCC(m_Value(LHS), m_Zero(), m_CondCode()))) return SDValue(); EVT LT = LHS.getValueType(); |