Diffstat (limited to 'llvm/lib/Target')
56 files changed, 1564 insertions, 819 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 076a623..639ddcb 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -69,7 +69,6 @@ def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, - fconstant_to_constant, icmp_redundant_trunc, fold_global_offset, shuffle_to_extract, @@ -341,7 +340,7 @@ def AArch64PostLegalizerLowering : GICombiner<"AArch64PostLegalizerLoweringImpl", [shuffle_vector_lowering, vashr_vlshr_imm, icmp_lowering, build_vector_lowering, - lower_vector_fcmp, form_truncstore, + lower_vector_fcmp, form_truncstore, fconstant_to_constant, vector_sext_inreg_to_shift, unmerge_ext_to_unmerge, lower_mulv2s64, vector_unmerge_lowering, insertelt_nonconst, diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 0f4bbfc3..1e607f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,9 +92,18 @@ private: bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock * - expandCommitOrRestoreZASave(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + struct ConditionalBlocks { + MachineBasicBlock &CondBB; + MachineBasicBlock &EndBB; + }; + ConditionalBlocks expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch); + MachineBasicBlock *expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + MachineBasicBlock *expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -991,72 +1000,97 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; - -MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; - assert((MI.getOpcode() == AArch64::RestoreZAPseudo || - MI.getOpcode() == AArch64::CommitZASavePseudo) && - "Expected ZA commit or restore"); +AArch64ExpandPseudo::ConditionalBlocks +AArch64ExpandPseudo::expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch) { assert((std::next(MBBI) != MBB.end() || - MI.getParent()->successors().begin() != - MI.getParent()->successors().end()) && - "Unexpected unreachable in block that restores ZA"); - - // Compare TPIDR2_EL0 value against 0. - DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Branch = - BuildMI(MBB, MBBI, DL, - TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) - .add(MI.getOperand(0)); + MBB.successors().begin() != MBB.successors().end()) && + "Unexpected unreachable in block"); // Split MBB and create two new blocks: - // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. - // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. + // - MBB now contains all instructions before the conditional pseudo. 
+ // - CondBB contains the conditional pseudo instruction only. + // - EndBB contains all instructions after the conditional pseudo. MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. - Branch.addMBB(SMBB); + MachineBasicBlock *CondBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); + MachineBasicBlock *EndBB = + std::next(MBBI) == CondBB->end() + ? *CondBB->successors().begin() + : CondBB->splitAt(*MBBI, /*UpdateLiveIns*/ true); + + // Add the SMBB label to the branch instruction & create a branch to EndBB. + Branch.addMBB(CondBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); + // Create branch from CondBB to EndBB. Users of this helper should insert new + // instructions at CondBB.back() -- i.e. before the branch. + BuildMI(CondBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + return {*CondBB, *EndBB}; +} + +MachineBasicBlock * +AArch64ExpandPseudo::expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Restore ZA if TPIDR2_EL0 is zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - if (IsRestoreZA) { - // Mark the TPIDR2 block pointer (X0) as an implicit use. - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - } else /*CommitZA*/ { + MI.eraseFromParent(); + return &EndBB; +} + +static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock * +AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Commit ZA if TPIDR2_EL0 is non-zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBNZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); + // Replace the pseudo with a call (BL). + MachineInstrBuilder MIB = + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); + // Copy operands (mainly the regmask) from the pseudo. + for (unsigned I = 2; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + // Clear TPIDR2_EL0. + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { [[maybe_unused]] auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - // Clear TPIDR2_EL0. 
- BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) - .addImm(AArch64SysReg::TPIDR2_EL0) - .addReg(AArch64::XZR); - bool ZeroZA = MI.getOperand(1).getImm() != 0; - if (ZeroZA) { - assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) - .addImm(ZERO_ALL_ZA_MASK) - .addDef(AArch64::ZAB0, RegState::ImplicitDefine); - } + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); } - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); - return EndBB; + return &EndBB; } MachineBasicBlock * @@ -1130,24 +1164,9 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, MachineInstrBuilder Tbx = BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0); - // Split MBB and create two new blocks: - // - MBB now contains all instructions before MSRcond_pstatesvcrImm1. - // - SMBB contains the MSRcond_pstatesvcrImm1 instruction only. - // - EndBB contains all instructions after MSRcond_pstatesvcrImm1. - MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Tbx.addMBB(SMBB); - BuildMI(&MBB, DL, TII->get(AArch64::B)) - .addMBB(EndBB); - MBB.addSuccessor(EndBB); - + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Tbx); // Create the SMSTART/SMSTOP (MSRpstatesvcrImm1) instruction in SMBB. - MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->begin(), MI.getDebugLoc(), + MachineInstrBuilder MIB = BuildMI(CondBB, CondBB.back(), MI.getDebugLoc(), TII->get(AArch64::MSRpstatesvcrImm1)); // Copy all but the second and third operands of MSRcond_pstatesvcrImm1 (as // these contain the CopyFromReg for the first argument and the flag to @@ -1157,10 +1176,8 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, for (unsigned i = 4; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); - MI.eraseFromParent(); - return EndBB; + return &EndBB; } bool AArch64ExpandPseudo::expandMultiVecPseudo( @@ -1674,15 +1691,21 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::RestoreZAPseudo: case AArch64::CommitZASavePseudo: - case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); - if (NewMBB != &MBB) - NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. - return true; - } case AArch64::MSRpstatePseudo: { - auto *NewMBB = expandCondSMToggle(MBB, MBBI); + auto *NewMBB = [&] { + switch (Opcode) { + case AArch64::RestoreZAPseudo: + return expandRestoreZASave(MBB, MBBI); + case AArch64::CommitZASavePseudo: + return expandCommitZASave(MBB, MBBI); + case AArch64::MSRpstatePseudo: + return expandCondSMToggle(MBB, MBBI); + default: + llvm_unreachable("Unexpected conditional pseudo!"); + } + }(); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index c197550e..9e2d698 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({s32, s64, s128}) - .legalFor(HasFP16, {s16}) + // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT + .legalFor({s16, s32, s64, s128}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 63313da..23dcaea 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -75,6 +75,31 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo() = default; }; +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 16 && DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. +void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + /// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector /// sources of the shuffle are different. std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 8c10673..896eab5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -44,31 +44,6 @@ namespace { #include "AArch64GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -/// Return true if a G_FCONSTANT instruction is known to be better-represented -/// as a G_CONSTANT. -bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - Register DstReg = MI.getOperand(0).getReg(); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - if (DstSize != 32 && DstSize != 64) - return false; - - // When we're storing a value, it doesn't matter what register bank it's on. - // Since not all floating point constants can be materialized using a fmov, - // it makes more sense to just use a GPR. 
- return all_of(MRI.use_nodbg_instructions(DstReg), - [](const MachineInstr &Use) { return Use.mayStore(); }); -} - -/// Change a G_FCONSTANT into a G_CONSTANT. -void applyFConstantToConstant(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - MachineIRBuilder MIB(MI); - const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); - MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); - MI.eraseFromParent(); -} - /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits /// are sign bits. In this case, we can transform the G_ICMP to directly compare /// the wide value with a zero. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index f90bcc7..830a35bb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -590,6 +590,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, unsigned Depth) const { switch (MI.getOpcode()) { case AArch64::G_DUP: + case AArch64::G_SADDLP: + case AArch64::G_UADDLP: case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: case TargetOpcode::G_EXTRACT_VECTOR_ELT: @@ -798,6 +800,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (Ty.isVector()) OpRegBankIdx[Idx] = PMI_FirstFPR; else if (isPreISelGenericFloatingPointOpcode(Opc) || + (MO.isDef() && onlyDefinesFP(MI, MRI, TRI)) || + (MO.isUse() && onlyUsesFP(MI, MRI, TRI)) || Ty.getSizeInBits() > 64) OpRegBankIdx[Idx] = PMI_FirstFPR; else diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 1a697f7..ea32748 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2592,6 +2592,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && " + "!Subtarget->d16PreservesUnusedBits()">; + def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, @@ -2769,6 +2772,9 @@ def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of FeatureMAIInsts)>; +def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">, + AssemblerPredicate<(all_of (not FeatureMAIInsts))>; + def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, AssemblerPredicate<(all_of FeatureSMemRealTime)>; @@ -2943,6 +2949,20 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic( def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; + +//===----------------------------------------------------------------------===// +// HwModes +//===----------------------------------------------------------------------===// + +// gfx90a-gfx950. 
Has AGPRs, and also the align2 VGPR/AGPR requirement +def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; + +// gfx1250, has alignment requirement but no AGPRs. +def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; + + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index cb49936..ef58004 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { - for (const auto &CI : IA->ParseConstraints()) { +/// Compute the minimum number of AGPRs required to allocate the inline asm. +static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA, + const CallBase &Call) { + unsigned ArgNo = 0; + unsigned ResNo = 0; + unsigned AGPRDefCount = 0; + unsigned AGPRUseCount = 0; + unsigned MaxPhysReg = 0; + const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout(); + + // TODO: Overestimates due to not accounting for tied operands + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + Type *Ty = nullptr; + switch (CI.Type) { + case InlineAsm::isOutput: { + Ty = Call.getType(); + if (auto *STy = dyn_cast<StructType>(Ty)) + Ty = STy->getElementType(ResNo); + ++ResNo; + break; + } + case InlineAsm::isInput: { + Ty = Call.getArgOperand(ArgNo++)->getType(); + break; + } + case InlineAsm::isLabel: + continue; + case InlineAsm::isClobber: + // Parse the physical register reference. + break; + } + for (StringRef Code : CI.Codes) { - Code.consume_front("{"); - if (Code.starts_with("a")) - return true; + unsigned RegCount = 0; + if (Code.starts_with("a")) { + // Virtual register, compute number of registers based on the type. + // + // We ought to be going through TargetLowering to get the number of + // registers, but we should avoid the dependence on CodeGen here. + RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32); + } else { + // Physical register reference + auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code); + if (Kind == 'a') { + RegCount = NumRegs; + MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u)); + } + + continue; + } + + if (CI.Type == InlineAsm::isOutput) { + // Apply tuple alignment requirement + // + // TODO: This is more conservative than necessary. + AGPRDefCount = alignTo(AGPRDefCount, RegCount); + + AGPRDefCount += RegCount; + if (CI.isEarlyClobber) { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } + } else { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } } } - return false; + unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); + + // TODO: This is overly conservative. If there are any physical registers, + // allocate any virtual registers after them so we don't have to solve optimal + // packing. + return std::min(MaxVirtReg + MaxPhysReg, 256u); } // TODO: Migrate to range merge of amdgpu-agpr-alloc. 
@@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> { const Function *Callee = dyn_cast<Function>(CalleeOp); if (!Callee) { if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) - return !inlineAsmUsesAGPRs(IA); + return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0; return false; } - // Some intrinsics may use AGPRs, but if we have a choice, we are not - // required to use AGPRs. - if (Callee->isIntrinsic()) + switch (Callee->getIntrinsicID()) { + case Intrinsic::not_intrinsic: + break; + case Intrinsic::write_register: + case Intrinsic::read_register: + case Intrinsic::read_volatile_register: { + const MDString *RegName = cast<MDString>( + cast<MDNode>( + cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata()) + ->getOperand(0)); + auto [Kind, RegIdx, NumRegs] = + AMDGPU::parseAsmPhysRegName(RegName->getString()); + return Kind != 'a'; + } + default: + // Some intrinsics may use AGPRs, but if we have a choice, we are not + // required to use AGPRs. return true; + } // TODO: Handle callsite attributes const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>( @@ -1504,7 +1584,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F)); A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F)); A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F)); - A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F)); CallingConv::ID CC = F->getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CC)) { A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F)); @@ -1515,6 +1594,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (!F->isDeclaration() && ST.hasClusters()) A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F)); + if (ST.hasGFX90AInsts()) + A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F)); + for (auto &I : instructions(F)) { Value *Ptr = nullptr; if (auto *LI = dyn_cast<LoadInst>(&I)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2192a72..e4d328a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -393,12 +393,13 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + const MCInstrDesc &Desc = TII->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; - int RegClass = Desc.operands()[OpIdx].RegClass; + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]); if (RegClass == -1) return nullptr; @@ -4353,7 +4354,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (!RC || SIRI->isSGPRClass(RC)) return false; - if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) { + if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass && + RC != &AMDGPU::VS_64_Align2RegClass) { AllUsesAcceptSReg = false; SDNode *User = U->getUser(); if (User->isMachineOpcode()) { @@ -4367,7 +4369,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { const TargetRegisterClass *CommutedRC = getOperandRegClass(U->getUser(), CommutedOpNo); if (CommutedRC == &AMDGPU::VS_32RegClass || - CommutedRC == &AMDGPU::VS_64RegClass) + CommutedRC == &AMDGPU::VS_64RegClass || + 
CommutedRC == &AMDGPU::VS_64_Align2RegClass) AllUsesAcceptSReg = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 280fbe2..c7a91f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -929,8 +929,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - AMDGPUAttributorOptions Opts; - MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { + AMDGPUAttributorOptions Opts; + MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + } } } }); @@ -964,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; @@ -1296,7 +1298,8 @@ void AMDGPUPassConfig::addIRPasses() { if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); - if (isPassEnabled(EnableImageIntrinsicOptimizer)) + if (TM.getTargetTriple().isAMDGCN() && + isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); // This can be disabled by passing ::Disable here or on the command line diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d0c0822..99ba043 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1347,6 +1347,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ForcedDPP = false; bool ForcedSDWA = false; KernelScopeInfo KernelScope; + const unsigned HwMode; /// @name Auto-generated Match Functions /// { @@ -1356,6 +1357,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } + /// Get size of register operand + unsigned getRegOperandSize(const MCInstrDesc &Desc, unsigned OpNo) const { + assert(OpNo < Desc.NumOperands); + int16_t RCID = MII.getOpRegClassID(Desc.operands()[OpNo], HwMode); + return getRegBitWidth(RCID) / 8; + } + private: void createConstantSymbol(StringRef Id, int64_t Val); @@ -1442,9 +1450,9 @@ public: using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) { + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI, MII), Parser(_Parser), + HwMode(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)) { MCAsmParserExtension::Initialize(Parser); setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); @@ -2097,6 +2105,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { // Only plain immediates are inlinable (e.g. "clamp" attribute is not) return false; } + + if (getModifiers().Lit != LitModifier::None) + return false; + // TODO: We should avoid using host float here. It would be better to // check the float bit values which is what a few other places do. // We've had bot failures before due to weird NaN support on mips hosts. 
@@ -2331,6 +2343,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo bool CanUse64BitLiterals = AsmParser->has64BitLiterals() && !(InstDesc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + LitModifier Lit = getModifiers().Lit; MCContext &Ctx = AsmParser->getContext(); if (Imm.IsFPImm) { // We got fp literal token @@ -2340,7 +2353,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); return; @@ -2364,14 +2378,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo if ((OpTy == AMDGPU::OPERAND_REG_IMM_FP64 || OpTy == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64) && - CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); + OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64)) { + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) { + // The floating-point operand will be verbalized as an + // integer one. If that integer happens to fit 32 bits, on + // re-assembling it will be intepreted as the high half of + // the actual value, so we have to wrap it into lit64(). + Lit = LitModifier::Lit64; + } else if (Lit == LitModifier::Lit) { + // For FP64 operands lit() specifies the high half of the value. + Val = Hi_32(Val); + } } - return; + break; } // We don't allow fp literals in 64-bit integer instructions. It is @@ -2380,19 +2400,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo llvm_unreachable("fp literal in 64-bit integer instruction."); case AMDGPU::OPERAND_KIMM64: - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) + Lit = LitModifier::Lit64; + break; case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2BF16: - if (AsmParser->hasInv2PiInlineImm() && Literal == 0x3fc45f306725feed) { + if (Lit == LitModifier::None && AsmParser->hasInv2PiInlineImm() && + Literal == 0x3fc45f306725feed) { // This is the 1/(2*pi) which is going to be truncated to bf16 with the // loss of precision. The constant represents ideomatic fp32 value of // 1/(2*pi) = 0.15915494 since bf16 is in fact fp32 with cleared low 16 @@ -2430,14 +2448,19 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // We allow precision lost but not overflow or underflow. 
This should be // checked earlier in isLiteralImm() - uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); - Inst.addOperand(MCOperand::createImm(ImmVal)); - return; + Val = FPLiteral.bitcastToAPInt().getZExtValue(); + break; } default: llvm_unreachable("invalid operand size"); } + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; } @@ -2457,12 +2480,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2471,22 +2494,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // truncated to uint32_t), if the target doesn't support 64-bit literals, or // the lit modifier is explicitly used, we need to truncate it to the 32 // LSBs. - if (!AsmParser->has64BitLiterals() || - getModifiers().Lit == LitModifier::Lit) + if (!AsmParser->has64BitLiterals() || Lit == LitModifier::Lit) Val = Lo_32(Val); - - if (CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val))) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2501,19 +2517,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // 1) explicitly forced by using lit modifier; // 2) the value is a valid 32-bit representation (signed or unsigned), // meanwhile not forced by lit64 modifier. - if (getModifiers().Lit == LitModifier::Lit || - (getModifiers().Lit != LitModifier::Lit64 && - (isInt<32>(Val) || isUInt<32>(Val)))) + if (Lit == LitModifier::Lit || + (Lit != LitModifier::Lit64 && (isInt<32>(Val) || isUInt<32>(Val)))) Val = static_cast<uint64_t>(Val) << 32; } - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + // For FP64 operands lit() specifies the high half of the value. 
+ if (Lit == LitModifier::Lit) + Val = Hi_32(Val); + break; case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -2526,25 +2538,23 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_KIMM64: - if ((isInt<32>(Val) || isUInt<32>(Val)) && - getModifiers().Lit != LitModifier::Lit64) + if ((isInt<32>(Val) || isUInt<32>(Val)) && Lit != LitModifier::Lit64) Val <<= 32; - - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; default: llvm_unreachable("invalid operand type"); } + + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } } void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { @@ -4107,7 +4117,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc) { if ((DMaskIdx == -1 || TFEIdx == -1) && isGFX10_AEncoding()) // intersect_ray return true; - unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); + unsigned VDataSize = getRegOperandSize(Desc, VDataIdx); unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) @@ -4171,8 +4181,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); bool IsNSA = SrsrcIdx - VAddr0Idx > 1; unsigned ActualAddrSize = - IsNSA ? SrsrcIdx - VAddr0Idx - : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + IsNSA ? 
SrsrcIdx - VAddr0Idx : getRegOperandSize(Desc, VAddr0Idx) / 4; unsigned ExpectedAddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); @@ -4182,8 +4191,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { ExpectedAddrSize > getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) { int VAddrLastIdx = SrsrcIdx - 1; - unsigned VAddrLastSize = - AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4; + unsigned VAddrLastSize = getRegOperandSize(Desc, VAddrLastIdx) / 4; ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize; } @@ -4429,7 +4437,8 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, return true; const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (TRI->getRegClass(Desc.operands()[0].RegClass).getSizeInBits() <= 128) + if (TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[0], HwMode)) + .getSizeInBits() <= 128) return true; if (TRI->regsOverlap(Src2Reg, DstReg)) { @@ -4814,12 +4823,15 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, const MCOperand &MO = Inst.getOperand(OpIdx); // Exclude special imm operands (like that used by s_set_gpr_idx_on) if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + bool IsLit = false; std::optional<int64_t> Imm; if (MO.isImm()) { Imm = MO.getImm(); } else if (MO.isExpr()) { - if (isLitExpr(MO.getExpr())) + if (isLitExpr(MO.getExpr())) { + IsLit = true; Imm = getLitValue(MO.getExpr()); + } } else { continue; } @@ -4829,7 +4841,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, } else if (!isInlineConstant(Inst, OpIdx)) { auto OpType = static_cast<AMDGPU::OperandType>( Desc.operands()[OpIdx].OperandType); - int64_t Value = encode32BitLiteral(*Imm, OpType); + int64_t Value = encode32BitLiteral(*Imm, OpType, IsLit); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; @@ -5000,7 +5012,7 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + AMDGPU::isDPALU_DPP(MII.get(Opc), MII, getSTI())) { // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share // only on GFX12. 
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); @@ -5523,7 +5535,8 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, unsigned Fmt = Inst.getOperand(FmtIdx).getImm(); int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp); unsigned RegSize = - TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits(); + TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[SrcIdx], HwMode)) + .getSizeInBits(); if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) return true; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 09a66d7..b97b738 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -417,10 +417,10 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> { } class getMUBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> { + list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> { RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getBUFVDataRegisterOperandForOp<vdataClass, isTFE>.ret; + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); @@ -453,8 +453,8 @@ class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, (ins)))))); } @@ -677,8 +677,8 @@ class MUBUF_Pseudo_Store_Lds<string opName> } class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, - list<RegisterClass> vaddrList=[]> { - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + list<RegisterClassLike> vaddrList=[]> { + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); @@ -702,9 +702,9 @@ class getMUBUFAtomicIns<int addrKind, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - 
getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, (ins)))))); } @@ -1568,11 +1568,12 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); - defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar data_op = getVregSrcForVT<data_vt>.ret; + defvar data_vt_RC = getVregClassForVT<data_vt>.ret; let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) - data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + data_op:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), @@ -1583,7 +1584,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string >; defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) - data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + data_op:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), @@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary)); defvar SrcRC = getVregSrcForVT<vt>.ret; - defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar DataRC = getVregClassForVT<data_vt>.ret; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); @@ -2088,7 +2089,7 @@ defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset)), @@ -2104,7 +2105,7 @@ multiclass MUBUFScratchStorePat_Common <string Instr, multiclass MUBUFScratchStorePat <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { let SubtargetPredicate = HasUnrestrictedSOffset in { defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>; } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index b2ff5a1..d0ad120 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -904,7 +904,7 @@ let SubtargetPredicate = isGFX1250Plus in { let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>; -defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>; +defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96_Align1>; } // End OtherPredicates = [HasTransposeLoadF4F6Insts] defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>; defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>; @@ -934,7 +934,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>; defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>; defm DS_READ_B64_TR_B16 : 
DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>; - defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>; + defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96_Align1>; } //===----------------------------------------------------------------------===// @@ -951,6 +951,11 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat (inst $ptr, Offset:$offset, (i1 gds)) >; +class DSReadPat_t16 <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (EXTRACT_SUBREG (inst $ptr, Offset:$offset, (i1 gds)), lo16) +>; + multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -968,13 +973,14 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> { def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; } - let OtherPredicates = [NotLDSRequiresM0Init] in { - let True16Predicate = NotUseRealTrue16Insts in { - def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; - } - let True16Predicate = UseRealTrue16Insts in { - def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>; - } + let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = NotUseRealTrue16Insts in { + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } + let OtherPredicates = [NotLDSRequiresM0Init, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>; + } + let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = UseTrue16WithSramECC in { + def : DSReadPat_t16<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2120bf8..be62395 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -57,7 +57,9 @@ static int64_t getInlineImmVal64(unsigned Imm); AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), + MAI(*Ctx.getAsmInfo()), + HwModeRegClass(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), + TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) { // ToDo: AMDGPUDisassembler supports only VI ISA. 
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) @@ -825,7 +827,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) { + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + if (Desc.TSFlags & SIInstrFlags::MIMG) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int RsrcIdx = @@ -838,7 +841,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, for (unsigned i = 0; i < NSAArgs; ++i) { const unsigned VAddrIdx = VAddr0Idx + 1 + i; auto VAddrRCID = - MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; + MCII->getOpRegClassID(Desc.operands()[VAddrIdx], HwModeRegClass); MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i])); } Bytes = Bytes.slice(4 * NSAWords); @@ -1311,7 +1314,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // Widen the register to the correct number of enabled channels. MCRegister NewVdata; if (DstSize != Info->VDataDwords) { - auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass; + auto DataRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VDataIdx], HwModeRegClass); // Get first subregister of VData MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg(); @@ -1338,7 +1342,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA; - auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; + auto AddrRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VAddrSAIdx], HwModeRegClass); + const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID); NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC); NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI); @@ -1545,7 +1551,7 @@ AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { HasLiteral = true; Literal = Literal64 = Val; - bool UseLit64 = Lo_32(Literal64) != 0; + bool UseLit64 = Hi_32(Literal64) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( LitModifier::Lit64, Literal64, getContext())) : MCOperand::createImm(Literal64); @@ -1578,11 +1584,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, if (CanUse64BitLiterals) { if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) - UseLit64 = !isInt<32>(Val) || !isUInt<32>(Val); + UseLit64 = false; else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) - UseLit64 = Lo_32(Val) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( @@ -1608,12 +1614,12 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { - UseLit64 = !isInt<32>(Literal64) || !isUInt<32>(Literal64); + UseLit64 = false; } else { assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); - UseLit64 = Lo_32(Literal64) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 935c383..2751857 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -41,6 +41,7 @@ private: std::unique_ptr<MCInstrInfo const> const MCII; const MCRegisterInfo &MRI; const MCAsmInfo &MAI; + const unsigned HwModeRegClass; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 5a22b23..6de59be 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -229,7 +229,7 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let OutOperandList = (outs vdata_op:$vdst); @@ -268,7 +268,7 @@ multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let InOperandList = !con( (ins VaddrRC:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), @@ -385,7 +385,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy (outs ), !con( !if(IsAsync, (ins VGPR_32:$vdst), (ins)), - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins flat_offset:$offset, CPol_0:$cpol)), !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let LGKM_CNT = !not(IsAsync); @@ -417,7 +417,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let VM_CNT = 0; @@ -511,7 +511,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } -class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> : FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { let has_vdst = 0; let has_data = 0; @@ -533,7 +533,7 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { multiclass FLAT_Global_Prefetch_Pseudo<string opName> { let is_flat_global = 1, has_saddr = 1 in { - def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, 
off">, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { @@ -754,7 +754,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName> { let FPAtomic = data_vt.isFP; @@ -786,7 +786,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -811,7 +811,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_agpr:$vdst), - (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn_agpr"> { let FPAtomic = data_vt.isFP; @@ -837,7 +837,7 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType data_vt = vt, RegisterOperand data_op = vdst_op, bit EnableSaddr = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> { let InOperandList = !con( (ins VaddrRC:$vaddr, data_op:$vdata), @@ -867,7 +867,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< RegisterOperand data_op = vdst_op, bit EnableSaddr = false, bit IsVGPR = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> { defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret); @@ -1321,7 +1321,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { } let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { - defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>; + defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96_Align1>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>; } @@ -1383,6 +1383,11 @@ class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType v (inst $vaddr, $offset, (i32 0)) >; +class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) +>; + class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1393,6 +1398,11 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value (inst $vaddr, $offset, (i32 0)) >; +class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> 
: GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset, (i32 0)), lo16) +>; + class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) @@ -1408,6 +1418,11 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) +>; + class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), (inst $dsaddr, $vaddr, $offset, $cpol) @@ -1443,6 +1458,11 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu (inst $saddr, $voffset, $offset, $cpol) >; +class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) +>; + class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), (inst $vaddr, $offset) @@ -1519,7 +1539,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1548,7 +1568,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1592,7 +1612,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1625,6 +1645,11 @@ class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $offset, 0) >; +class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) +>; + class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, 
(ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset) @@ -1645,6 +1670,11 @@ class ScratchLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Val (inst $saddr, $offset, 0) >; +class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))), + (EXTRACT_SUBREG (inst $saddr, $offset), lo16) +>; + class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)), @@ -1672,6 +1702,11 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16) +>; + multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> { def : FlatLoadLDSSignedPat_M0 <inst, node> { let AddedComplexity = 10; @@ -1764,6 +1799,16 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value } } +multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatSignedLoadPat_t16<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatStoreSignedPat <inst, node, vt> { @@ -1872,8 +1917,8 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, } } -multiclass ScratchFLATStorePats_t16<string inst, SDPatternOperator node, - ValueType vt> { +multiclass ScratchFLATStorePats_D16_t16<string inst, SDPatternOperator node, + ValueType vt> { def : ScratchStoreSignedPat <!cast<FLAT_Pseudo>(inst#"_t16"), node, vt> { let AddedComplexity = 25; } @@ -1918,6 +1963,21 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : ScratchLoadSignedPat_t16 <inst, node, vt> { + let AddedComplexity = 25; + } + + def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 26; + } + + def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } +} + multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadPat <inst, node, vt> { let OtherPredicates = [HasFlatAddressSpace]; @@ -1947,6 +2007,17 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT } } +multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_t16 <inst, node, vt> { + let OtherPredicates = [HasFlatAddressSpace]; + } + + def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatStorePat <inst, node, vt> { let OtherPredicates = [HasFlatAddressSpace]; @@ -1997,6 +2068,17 @@ let True16Predicate = 
NotUseRealTrue16Insts in { defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } +let True16Predicate = UseTrue16WithSramECC in { + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; +} + let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; @@ -2006,11 +2088,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; +} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts + +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>; defm : FlatStorePats_t16 <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; defm : FlatStorePats_t16 <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; -} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts +} defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; @@ -2140,6 +2225,20 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>; } +let True16Predicate = UseTrue16WithSramECC in { +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>; +defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>; +} + let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; @@ -2192,6 +2291,13 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, 
atomic_store_8_global, i16>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>; } +let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = UseRealTrue16Insts in { +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", truncstorei8_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", store_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>; +defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>; +} + let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; @@ -2362,14 +2468,24 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>; defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>; } -let True16Predicate = UseRealTrue16Insts in { +let True16Predicate = UseTrue16WithSramECC in { +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>; +defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>; +} + +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>; defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>; -defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>; -defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>; -} // End True16Predicate = UseRealTrue16Insts +} // End OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts + +let True16Predicate = UseRealTrue16Insts in { +defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_SHORT", store_private, i16>; +defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>; +} foreach vt = Reg32Types.types in { defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 8821558..464cbec 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -722,7 +722,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && - AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *TII, *ST)) { LLVM_DEBUG(dbgs() << " " << OrigMI << " failed: not valid 64-bit DPP control value\n"); break; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 1d9a427..a911e7e 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -869,7 +869,7 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); int VDataRCID = -1; if (VDataIdx != -1) - VDataRCID = Desc.operands()[VDataIdx].RegClass; + VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]); if (TII->isMUBUF(MI) || 
TII->isMTBUF(MI)) { // There is no hazard if the instruction does not use vector regs @@ -893,8 +893,8 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. if (TII->isMIMG(MI)) { int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); - assert(SRsrcIdx != -1 && - AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); + assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID( + Desc.operands()[SRsrcIdx])) == 256); (void)SRsrcIdx; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 3563caa..703ec0a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -73,7 +73,13 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + MAI.printExpr(O, *Op.getExpr()); + return; + } + + O << formatHex(Op.getImm() & 0xffffffff); } void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, @@ -788,9 +794,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Check if operand register class contains register used. // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. - int RCID = Desc.operands()[OpNo].RegClass; + const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); if (RCID != -1) { - const MCRegisterClass RC = MRI.getRegClass(RCID); + const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) @@ -1025,7 +1033,7 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && - AMDGPU::isDPALU_DPP(Desc, STI)) { + AMDGPU::isDPALU_DPP(Desc, MII, STI)) { O << " /* DP ALU dpp only supports " << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */"; return; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f2879116..ea758bb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -270,10 +270,19 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo, const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const { const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int64_t Imm; + int64_t Imm = 0; if (MO.isExpr()) { - if (!MO.getExpr()->evaluateAsAbsolute(Imm)) - return AMDGPU::getOperandSize(OpInfo) == 8 ? 
254 : 255; + if (!MO.getExpr()->evaluateAsAbsolute(Imm) || + AMDGPU::isLitExpr(MO.getExpr())) { + if (OpInfo.OperandType == AMDGPU::OPERAND_KIMM16 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM32 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM64) + return Imm; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + AMDGPU::getOperandSize(OpInfo) == 8) + return 254; + return 255; + } } else { assert(!MO.isDFPImm()); @@ -452,13 +461,16 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Yes! Encode it int64_t Imm = 0; + bool IsLit = false; if (Op.isImm()) Imm = Op.getImm(); else if (Op.isExpr()) { - if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) { Imm = C->getValue(); - else if (AMDGPU::isLitExpr(Op.getExpr())) + } else if (AMDGPU::isLitExpr(Op.getExpr())) { + IsLit = true; Imm = AMDGPU::getLitValue(Op.getExpr()); + } } else // Exprs will be replaced with a fixup value. llvm_unreachable("Must be immediate or expr"); @@ -468,7 +480,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, } else { auto OpType = static_cast<AMDGPU::OperandType>(Desc.operands()[i].OperandType); - Imm = AMDGPU::encode32BitLiteral(Imm, OpType); + Imm = AMDGPU::encode32BitLiteral(Imm, OpType, IsLit); support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little); } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 291c03a..64e34db 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1516,7 +1516,8 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> { int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11))); RegisterOperand RegClass = MIMGAddrSize<num_addrs, 0>.RegClass; - defvar Size = !cast<SIRegisterClass>(RegClass.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RegClass.RegClass).Size; + int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 90c828b..6616b30 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1077,7 +1077,7 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - int16_t RCID = Desc.operands()[UseOpIdx].RegClass; + int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]); if (RCID == -1) return false; @@ -1299,10 +1299,8 @@ void SIFoldOperandsImpl::foldOperand( AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO, AMDGPU::AV_MOV_B64_IMM_PSEUDO}) { const MCInstrDesc &MovDesc = TII->get(MovOp); - assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1); - const TargetRegisterClass *MovDstRC = - TRI->getRegClass(MovDesc.operands()[0].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0])); // Fold if the destination register class of the MOV instruction (ResRC) // is a superclass of (or equal to) the destination register class of the @@ -1312,7 +1310,8 @@ void SIFoldOperandsImpl::foldOperand( const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 
2 : 1; const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); + if (MovSrcRC) { if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 46757cf..ec5c5bb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5029,9 +5029,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } - int RegClass = Desc.operands()[i].RegClass; - const MCOperandInfo &OpInfo = Desc.operands()[i]; + int16_t RegClass = getOpRegClassID(OpInfo); + switch (OpInfo.OperandType) { case MCOI::OPERAND_REGISTER: if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { @@ -5635,7 +5635,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && - AMDGPU::isDPALU_DPP(Desc, ST)) { + AMDGPU::isDPALU_DPP(Desc, *this, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -6031,48 +6031,17 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -static const TargetRegisterClass * -adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, - const MCInstrDesc &TID, unsigned RCID) { - if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) { - switch (RCID) { - case AMDGPU::AV_32RegClassID: - RCID = AMDGPU::VGPR_32RegClassID; - break; - case AMDGPU::AV_64RegClassID: - RCID = AMDGPU::VReg_64RegClassID; - break; - case AMDGPU::AV_96RegClassID: - RCID = AMDGPU::VReg_96RegClassID; - break; - case AMDGPU::AV_128RegClassID: - RCID = AMDGPU::VReg_128RegClassID; - break; - case AMDGPU::AV_160RegClassID: - RCID = AMDGPU::VReg_160RegClassID; - break; - case AMDGPU::AV_512RegClassID: - RCID = AMDGPU::VReg_512RegClassID; - break; - default: - break; - } - } - - return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); -} - +// FIXME: This should not be an overridable function. All subtarget dependent +// operand modifications should go through isLookupRegClassByHwMode in the +// generic handling. const TargetRegisterClass * SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const { if (OpNum >= TID.getNumOperands()) return nullptr; - auto RegClass = TID.operands()[OpNum].RegClass; - // Special pseudos have no alignment requirement. 
- if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID)) - return RI.getRegClass(RegClass); - - return adjustAllocatableRegClass(ST, RI, TID, RegClass); + const MCOperandInfo &OpInfo = TID.operands()[OpNum]; + int16_t RegClass = getOpRegClassID(OpInfo); + return RI.getRegClass(RegClass); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -6090,8 +6059,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getPhysRegBaseClass(Reg); } - unsigned RCID = Desc.operands()[OpNo].RegClass; - return adjustAllocatableRegClass(ST, RI, Desc, RCID); + return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6099,7 +6067,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; + unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]); const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO @@ -6168,7 +6136,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); - const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); + const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo)); if (Reg.isPhysical()) return DRC->contains(Reg); @@ -6293,8 +6261,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; + int64_t RegClass = getOpRegClassID(OpInfo); const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + RegClass != -1 ? RI.getRegClass(RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); @@ -7619,7 +7588,7 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx, if (!RI.isVGPRClass(CurrRC)) return; - unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]); const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID); if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) { Op.setSubReg(AMDGPU::lo16); @@ -9323,7 +9292,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, // Is this operand statically required to be an SGPR based on the operand // constraints? const TargetRegisterClass *OpRC = - RI.getRegClass(Desc.operands()[Idx].RegClass); + RI.getRegClass(getOpRegClassID(Desc.operands()[Idx])); bool IsRequiredSGPR = RI.isSGPRClass(OpRC); if (IsRequiredSGPR) return MO.getReg(); @@ -9804,7 +9773,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { if (Idx == -1) // e.g. 
s_memtime return false; - const auto RCID = MI.getDesc().operands()[Idx].RegClass; + const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]); return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index cc59acf..e979eeb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1298,7 +1298,7 @@ public: return 4; } - return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; + return RI.getRegSizeInBits(*RI.getRegClass(getOpRegClassID(OpInfo))) / 8; } /// This form should usually be preferred since it handles operands diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 18a5393..b7f63ec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1151,6 +1151,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +// FIXME: Should change class based on hasSDWAScalar to exclude SGPRs class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; string Type = !if(vt.isFP, "FP", "INT"); @@ -1807,13 +1808,13 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16, VOPDstOperand_t16Lo128), VOPDstOperand<VGPR_32>); - RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>, - !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, - !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, - !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, - !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, - !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, - !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, + RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024_AlignTarget>, + !eq(VT.Size, 512) : VOPDstOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 256) : VOPDstOperand<VReg_256_AlignTarget>, + !eq(VT.Size, 192) : VOPDstOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : VOPDstOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : VOPDstOperand<VReg_64_AlignTarget>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, 1 : VOPDstS64orS32); // else VT == i1 @@ -1821,8 +1822,8 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { class getVALUDstForVT_fake16<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128_AlignTarget>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64_AlignTarget>, !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32_Lo128>, VOPDstS64orS32)))); // else VT == i1 } @@ -1890,21 +1891,38 @@ class getSOPSrcForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32); } -// Returns the vreg register class to use for source operand given VT +// Returns the vreg register operand to use for source operand given VT. +// This should only be used for a target instruction's ins list. 
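A minimal sketch of the operand register-class lookup that the SIInstrInfo.cpp and SIInstrInfo.h hunks above convert callers to; only getOpRegClassID and getRegClass are taken from the patch itself, while the wrapper name resolveOpRegClass and its exact signature are illustrative assumptions:

#include "SIInstrInfo.h"   // declares getOpRegClassID per this patch
#include "SIRegisterInfo.h"

// Illustrative helper (not part of the patch): resolve operand OpIdx's
// register class through the instruction-info hook instead of reading
// MCOperandInfo::RegClass directly, so hw-mode dependent operand classes
// (e.g. the *_AlignTarget classes defined later in this patch) yield the
// subtarget-specific register class.
static const llvm::TargetRegisterClass *
resolveOpRegClass(const llvm::SIInstrInfo &TII, const llvm::SIRegisterInfo &RI,
                  const llvm::MCInstrDesc &Desc, unsigned OpIdx) {
  const llvm::MCOperandInfo &OpInfo = Desc.operands()[OpIdx];
  int16_t RCID = TII.getOpRegClassID(OpInfo); // hw-mode aware class ID
  return RCID == -1 ? nullptr : RI.getRegClass(RCID);
}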
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { RegisterOperand ret = - !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>, - !eq(VT.Size, 192) : RegisterOperand<VReg_192>, - !eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 192) : RegisterOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128), RegisterOperand<VGPR_32>), 1 : RegisterOperand<VGPR_32>); } +// Returns a concrete vgpr register class to use for a value type VT, +// which exists separately from a real instruction use. +class getVregClassForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { + RegisterClass ret = + !cond(!eq(VT.Size, 512) : VReg_512, + !eq(VT.Size, 192) : VReg_192, + !eq(VT.Size, 128) : VReg_128, + !eq(VT.Size, 96) : VReg_96, + !eq(VT.Size, 64) : VReg_64, + !eq(VT.Size, 48) : VReg_64, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), + VGPR_32), + 1 : VGPR_32); +} + class getSDWASrcForVT <ValueType VT> { RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); @@ -2638,7 +2656,7 @@ class getAlign2RegOp<RegisterOperand RC> { } class getEquivalentAGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = !cond(!eq(Size, 32) : RegisterOperand<AGPR_32>, !eq(Size, 64) : RegisterOperand<AReg_64>, @@ -2649,16 +2667,33 @@ class getEquivalentAGPROperand<RegisterOperand RC> { } class getEquivalentVGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = - !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>, - !eq(Size, 64) : RegisterOperand<VReg_64>, - !eq(Size, 96) : RegisterOperand<VReg_96>, - !eq(Size, 128) : RegisterOperand<VReg_128>, - !eq(Size, 160) : RegisterOperand<VReg_160>, - !eq(Size, 1024) : RegisterOperand<VReg_1024>); + !cond( + !eq(RC, VGPROp_32) : VGPROp_32, + !eq(RC, VGPROp_64) : VGPROp_64, + + !eq(RC, AVLdSt_32) : VGPROp_32, + !eq(RC, AVLdSt_64) : VGPROp_64, + !eq(RC, AVLdSt_96) : VGPROp_96, + !eq(RC, AVLdSt_128) : VGPROp_128, + !eq(RC, AVLdSt_160) : VGPROp_160, + !eq(RC, AVLdSt_1024) : VGPROp_1024, + + !eq(RC, AVLdSt_64_Align2) : VGPROp_64_Align2, + !eq(RC, AVLdSt_96_Align2) : VGPROp_96_Align2, + !eq(RC, AVLdSt_128_Align2) : VGPROp_128_Align2, + !eq(RC, AVLdSt_160_Align2) : VGPROp_160_Align2, + !eq(RC, AVLdSt_1024_Align2) : VGPROp_1024_Align2, + + !eq(RC, AVLdSt_64_Align1) : VGPROp_64_Align1, + !eq(RC, AVLdSt_96_Align1) : VGPROp_96_Align1, + !eq(RC, AVLdSt_128_Align1) : VGPROp_128_Align1, + !eq(RC, AVLdSt_160_Align1) : VGPROp_160_Align1, + !eq(RC, AVLdSt_1024_Align1) : VGPROp_1024_Align1); } + class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32, ValueType Src2VT = i32> { bit ret = !if(!eq(DstVT.Size, 64), @@ -3190,7 +3225,7 @@ class Commutable_REV <string revOp, bit isOrig> { // 
Interpolation opcodes //===----------------------------------------------------------------------===// -class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">; +class VINTRPDstOperand <RegisterClassLike rc> : RegisterOperand <rc, "printVINTRPDst">; class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index be084a9..eac9fd4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -120,6 +120,8 @@ def ATOMIC_FENCE : SPseudoInstSI< let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns +// No align needed as it will be decomposed anyway +// TODO: Remove alignment requirement from sources def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { let isPseudo = 1; @@ -129,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0)> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -163,9 +165,6 @@ def AV_MOV_B32_IMM_PSEUDO // 64-bit materialize immediate which supports AGPR or VGPR. This has // an unusual operand restriction which requires the two halves of the // immediate to each be 32-bit inline immediate values. -// -// FIXME: This unnecessarily has the even aligned vector register -// requirement applied. def AV_MOV_B64_IMM_PSEUDO : VPseudoInstSI<(outs AV_64:$vdst), (ins AV_64_PSEUDO_IMM:$src0)> { let isReMaterializable = 1; @@ -381,13 +380,13 @@ foreach Op = Operations in { let usesCustomInserter = 1, Defs = [VCC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] >; def V_SUB_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] >; } // End usesCustomInserter = 1, Defs = [VCC] @@ -1142,7 +1141,7 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register // needs to be used and an extra instruction to move between VGPR and AGPR. // UsesTmp adds to the total size of an expanded spill in this case. -multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, +multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class, bit UsesTmp = 0, bit HasMask = 0> { let UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] in { @@ -1177,21 +1176,25 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, } // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] } +// TODO: Technically the AlignTarget register class constraint is +// overly conservative for gfx90a. 
There is an alignment requirement, +// but the underlying spill will be lowered to 32-bit accesses. + defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>; defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; -defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; -defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; -defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; -defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; -defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; -defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; -defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; -defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>; -defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>; -defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>; -defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>; -defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; -defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64_AlignTarget>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96_AlignTarget>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128_AlignTarget>; +defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160_AlignTarget>; +defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192_AlignTarget>; +defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224_AlignTarget>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256_AlignTarget>; +defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288_AlignTarget>; +defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320_AlignTarget>; +defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352_AlignTarget>; +defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384_AlignTarget>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512_AlignTarget>; +defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024_AlignTarget>; let Defs = [M0] in { // Spills a block of 32 VGPRs. M0 will contain a mask describing which @@ -1200,34 +1203,34 @@ let Defs = [M0] in { } defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>; -defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>; -defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>; -defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; -defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; -defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; -defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; -defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; -defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>; -defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>; -defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>; -defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>; -defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; -defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64_AlignTarget, 1>; +defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96_AlignTarget, 1>; +defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128_AlignTarget, 1>; +defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160_AlignTarget, 1>; +defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192_AlignTarget, 1>; +defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224_AlignTarget, 1>; +defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256_AlignTarget, 1>; +defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288_AlignTarget, 1>; +defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320_AlignTarget, 1>; +defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352_AlignTarget, 1>; +defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384_AlignTarget, 1>; +defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512_AlignTarget, 1>; +defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024_AlignTarget, 1>; defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; -defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; -defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; -defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; -defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; -defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; 
-defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; -defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; -defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>; -defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>; -defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>; -defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; -defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; -defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64_AlignTarget, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96_AlignTarget, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128_AlignTarget, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160_AlignTarget, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192_AlignTarget, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224_AlignTarget, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256_AlignTarget, 1>; +defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288_AlignTarget, 1>; +defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320_AlignTarget, 1>; +defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352_AlignTarget, 1>; +defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384_AlignTarget, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512_AlignTarget, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024_AlignTarget, 1>; let isConvergent = 1 in { defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; @@ -2383,18 +2386,24 @@ let True16Predicate = UseRealTrue16Insts in { } } -// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit -// immediate and wil be expanded as needed, but we will only use these patterns -// for values which can be encoded. -def : GCNPat < - (VGPRImm<(i64 imm)>:$imm), - (V_MOV_B64_PSEUDO imm:$imm) ->; +/// FIXME: Increasing the priority of VGPRImm over the scalar forms as +/// a workaround for a phase ordering problem caused by overly +/// conservative MachineCSE. If we end up with an s_mov_b64 + copy to +/// vgpr pattern, MachineCSE will not perform the CSE which occurs +/// after operand folding. +let AddedComplexity = 1 in { + // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit + // immediate and wil be expanded as needed, but we will only use these patterns + // for values which can be encoded. 
+ def : GCNPat < + (VGPRImm<(i64 imm)>:$imm), + (V_MOV_B64_PSEUDO imm:$imm)>; -def : GCNPat < - (VGPRImm<(f64 fpimm)>:$imm), - (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) ->; + def : GCNPat < + (VGPRImm<(f64 fpimm)>:$imm), + (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) + >; +} // End let AddedComplexity = 2 def : GCNPat < (i64 imm:$imm), diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index afe76e1..bfac639 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1338,8 +1338,9 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; unsigned I = Op.getOperandNo(); - if (Desc.operands()[I].RegClass == -1 || - !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass))) + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); + if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) continue; if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3115579..be1c883 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -328,7 +328,8 @@ struct SGPRSpillBuilder { SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(), ST.getAMDGPUDwarfFlavour(), - /*PC=*/0, ST.getHwMode()), + /*PC=*/0, + ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 82fc240..fc8f46a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -91,16 +91,23 @@ class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0, int Index = !cast<int>(regIdx); } -// For register classes that use TSFlags. -class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> - : RegisterClass <n, rTypes, Align, rList> { +class SIRegisterClassLike<int BW = 0, bit V = false, + bit A = false, + bit S = false> { + // Bitwidth of the register + field int Size = BW; + // For vector register classes. - field bit HasVGPR = 0; - field bit HasAGPR = 0; + field bit HasVGPR = V; + field bit HasAGPR = A; // For scalar register classes. - field bit HasSGPR = 0; + field bit HasSGPR = S; +} +// For register classes that use TSFlags. +class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> + : RegisterClass <n, rTypes, Align, rList>, SIRegisterClassLike { // Alignment of the first register in tuple (in 32-bit units). field int RegTupleAlignUnits = 1; @@ -991,7 +998,8 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1, BaseClassPriority = 1 in { + let HasVGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeVReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. 
def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1031,7 +1039,8 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeAReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1197,15 +1206,87 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 } //===----------------------------------------------------------------------===// -// Register operands +// +// AlignTarget classes. Artifical classes to swap between +// even-aligned and any-aligned classes depending on subtarget. +// //===----------------------------------------------------------------------===// +def AV_LdSt_32_Target : RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + let DecoderMethod = "decodeAVLdSt"; +} + +foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { + def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; + } + + def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [!cast<RegisterClass>("AReg_"#RegSize), + !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/]> { + let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; + } + + def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; + } + + def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize)]> { + let DecoderMethod = "decodeAVLdSt"; + } +} + +def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, + RegClassByHwMode< + 
[DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VS_64, VS_64_Align2, VS_64_Align2]> { + let DecoderMethod = "decodeSrcRegOrImm9"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; } -class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> +class RegOrImmOperand <RegisterClassLike RegClass, string OperandTypeName> : RegisterOperand<RegClass> { let OperandNamespace = "AMDGPU"; let OperandType = OperandTypeName; @@ -1213,14 +1294,18 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// +// Register operands +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate // if supported by target. //===----------------------------------------------------------------------===// -class SrcRegOrImm9<RegisterClass regClass, string operandType> +class SrcRegOrImm9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { string DecoderMethodName = "decodeSrcRegOrImm9"; - let DecoderMethod = DecoderMethodName # "<" # regClass.Size # ">"; + let DecoderMethod = DecoderMethodName # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16> @@ -1277,12 +1362,12 @@ def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">; def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; -def VSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_INT64">; -def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { +def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; +def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { let DecoderMethod = "decodeOperand_VSrc_f64"; } -def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; -def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">; +def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">; def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; @@ -1292,19 +1377,19 @@ def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16 // This is for operands with the enum(9), VSrc encoding restriction, // but only allows VGPRs. 
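At the MC layer the hw-mode selection for the AlignTarget operand classes defined above is resolved explicitly; the calls below are the ones introduced in the AMDGPUInstPrinter hunk earlier in this patch, wrapped in an illustrative helper whose name and signature are assumptions, not part of the diff:

#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;

// Illustrative helper (not part of the patch): map an operand's hw-mode
// dependent class ID to a concrete MCRegisterClass. Per the tables above,
// VReg_64_AlignTarget resolves to VReg_64 in DefaultMode and to
// VReg_64_Align2 in AVAlign2LoadStoreMode / AlignedVGPRNoAGPRMode.
static const MCRegisterClass *
resolveMCOpRegClass(const MCInstrInfo &MII, const MCRegisterInfo &MRI,
                    const MCSubtargetInfo &STI, const MCOperandInfo &OpInfo) {
  int16_t RCID = MII.getOpRegClassID(
      OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
  return RCID == -1 ? nullptr : &MRI.getRegClass(RCID);
}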
-class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> { - let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">"; +class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> { + let DecoderMethod = "decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64>; -def VRegSrc_96 : SrcReg9<VReg_96>; -def VRegSrc_128 : SrcReg9<VReg_128>; -def VRegSrc_192 : SrcReg9<VReg_192>; -def VRegSrc_256 : SrcReg9<VReg_256>; -def VRegSrc_384 : SrcReg9<VReg_384>; -def VRegSrc_512 : SrcReg9<VReg_512>; -def VRegSrc_1024 : SrcReg9<VReg_1024>; +def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; +def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; +def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; +def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; +def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; +def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; +def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; +def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1325,23 +1410,23 @@ class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> { class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> { let DecoderMethod = "Decode" # regClass # "RegisterClass"; } -multiclass VGPROp_Aligned<RegisterClass regClass> { - def _Align1 : VGPROp<regClass>; - def _Align2 : VGPROp_Align2<regClass>; -} // TODO: These cases should use default target alignment def VGPROp_16 : VGPROp<VGPR_16> { let EncoderMethod = "getMachineOpValueT16"; } + def VGPROp_32 : VGPROp<VGPR_32>; foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "320", "352", "384", "512", "1024"] in { - def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>; -} + // Target default alignment + def VGPROp_#size : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_AlignTarget)>; + + // No alignment requirement + def VGPROp_#size#_Align1 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size)>; -foreach size = ["64", "96", "128", "160", "256", "1024"] in { - defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>; + // Always even alignment requirement + def VGPROp_#size#_Align2 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_Align2)>; } def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { @@ -1357,9 +1442,9 @@ def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// -class AVOperand<RegisterClass regClass, string decoder> +class AVOperand<RegisterClassLike regClass, string decoder> : RegisterOperand<regClass> { - let DecoderMethod = decoder # "<" # regClass.Size # ">"; + let DecoderMethod = decoder # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; let EncoderMethod = "getAVOperandEncoding"; } @@ -1374,13 +1459,13 @@ def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; -def VCSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_INT64">; -def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; +def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : 
SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; -def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; // True 16 Operands @@ -1391,73 +1476,80 @@ def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_64_f16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_64_b32 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_64_f64 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_128_f16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_128_b32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_128_f32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_b32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR 
//===----------------------------------------------------------------------===// -class AVSrcOperand<RegisterClass regClass> +class AVSrcOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeSrcAV10">; def AVSrc_32 : AVSrcOperand<AV_32>; -def AVSrc_64 : AVSrcOperand<AV_64>; -def AVSrc_128 : AVSrcOperand<AV_128>; -def AVSrc_192 : AVSrcOperand<AV_192>; -def AVSrc_256 : AVSrcOperand<AV_256>; +def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; +def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>; +def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>; +def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>; -class AVDstOperand<RegisterClass regClass> +def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; +def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>; +def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>; +def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>; + +class AVDstOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAV10">; def AVDst_128 : AVDstOperand<AV_128>; def AVDst_256 : AVDstOperand<AV_256>; def AVDst_512 : AVDstOperand<AV_512>; -class AVLdStOperand<RegisterClass regClass> +def AVDst_128_Align2 : AVDstOperand<AV_128_Align2>; +def AVDst_256_Align2 : AVDstOperand<AV_256_Align2>; +def AVDst_512_Align2 : AVDstOperand<AV_512_Align2>; + +class AVLdStOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAVLdSt">; -def AVLdSt_32 : AVLdStOperand<AV_32>; +def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>; foreach size = ["64", "96", "128", "160", "256", "1024" ] in { - // TODO: These cases should use target align variant - def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - - def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>; + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; + def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>; + def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// -class SrcRegOrImmA9<RegisterClass regClass, string operandType> +class SrcRegOrImmA9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { - let DecoderMethod = "decodeSrcRegOrImmA9<" # regClass.Size # ">"; + let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_256_f64 : 
SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; //===----------------------------------------------------------------------===// // Tablegen programming utilities @@ -1467,10 +1559,10 @@ def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; /// instruction's operand list, which may be a RegisterOperand or a /// direct RegisterClass reference. class getRegClassFromOp<DAGOperand Op> { - SIRegisterClass ret = !if( + SIRegisterClassLike ret = !if( !isa<RegisterOperand>(Op), - !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass), - !cast<SIRegisterClass>(Op)); + !cast<SIRegisterClassLike>(!cast<RegisterOperand>(Op).RegClass), + !cast<SIRegisterClassLike>(Op)); } /// Check if the operand will use an AV_* class. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f7f4d46..3e1b058 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1569,12 +1569,7 @@ static bool isValidRegPrefix(char C) { return C == 'v' || C == 's' || C == 'a'; } -std::tuple<char, unsigned, unsigned> -parseAsmConstraintPhysReg(StringRef Constraint) { - StringRef RegName = Constraint; - if (!RegName.consume_front("{") || !RegName.consume_back("}")) - return {}; - +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef RegName) { char Kind = RegName.front(); if (!isValidRegPrefix(Kind)) return {}; @@ -1601,6 +1596,14 @@ parseAsmConstraintPhysReg(StringRef Constraint) { return {}; } +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint) { + StringRef RegName = Constraint; + if (!RegName.consume_front("{") || !RegName.consume_back("}")) + return {}; + return parseAsmPhysRegName(RegName); +} + std::pair<unsigned, unsigned> getIntegerPairAttribute(const Function &F, StringRef Name, std::pair<unsigned, unsigned> Default, @@ -2927,13 +2930,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { return getRegBitWidth(RC.getID()); } -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo) { - assert(OpNo < Desc.NumOperands); - unsigned RCID = Desc.operands()[OpNo].RegClass; - return getRegBitWidth(RCID) / 8; -} - bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { if (isInlinableIntLiteral(Literal)) return true; @@ -3149,7 +3145,7 @@ bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { return isUInt<32>(Val) || isInt<32>(Val); } -int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) { switch (Type) { default: break; @@ -3172,7 +3168,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { case OPERAND_REG_INLINE_C_INT32: return Lo_32(Imm); case OPERAND_REG_IMM_FP64: - return Hi_32(Imm); + return IsLit ? 
Imm : Hi_32(Imm); } return Imm; } @@ -3499,14 +3495,18 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { return false; } -bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) { int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); if (Idx == -1) continue; - if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID || - OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID) + const MCOperandInfo &OpInfo = OpDesc.operands()[Idx]; + int16_t RegClass = MII.getOpRegClassID( + OpInfo, ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); + if (RegClass == AMDGPU::VReg_64RegClassID || + RegClass == AMDGPU::VReg_64_Align2RegClassID) return true; } @@ -3533,14 +3533,15 @@ bool isDPALU_DPP32BitOpc(unsigned Opc) { } } -bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) return false; if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); - return hasAny64BitVGPROperands(OpDesc); + return hasAny64BitVGPROperands(OpDesc, MII, ST); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2b9c063..a01a5fd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1014,6 +1014,13 @@ bool isReadOnlySegment(const GlobalValue *GV); bool shouldEmitConstantsToTextSection(const Triple &TT); /// Returns a valid charcode or 0 in the first entry if this is a valid physical +/// register name. Followed by the start register number, and the register +/// width. Does not validate the number of registers exists in the class. Unlike +/// parseAsmConstraintPhysReg, this does not expect the name to be wrapped in +/// "{}". +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef TupleString); + +/// Returns a valid charcode or 0 in the first entry if this is a valid physical /// register constraint. Followed by the start register number, and the register /// width. Does not validate the number of registers exists in the class. std::tuple<char, unsigned, unsigned> @@ -1620,10 +1627,6 @@ unsigned getRegBitWidth(unsigned RCID); /// Get the size in bits of a register from the register class \p RC. unsigned getRegBitWidth(const MCRegisterClass &RC); -/// Get size of register operand -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo); - LLVM_READNONE inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { switch (OpInfo.OperandType) { @@ -1724,7 +1727,7 @@ LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); LLVM_READNONE -int64_t encode32BitLiteral(int64_t Imm, OperandType Type); +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit); bool isArgPassedInSGPR(const Argument *Arg); @@ -1780,13 +1783,15 @@ inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { } /// \returns true if an instruction may have a 64-bit VGPR operand. 
-bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. bool isDPALU_DPP32BitOpc(unsigned Opc); /// \returns true if an instruction is a DP ALU DPP. -bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 30dab55..d87d250 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -405,7 +405,7 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> { field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), !if(!eq(vt.Size, 64), - (ins VSrc_f64:$src0, VReg_64:$src1, ImmOpType:$imm), + (ins VSrc_f64:$src0, VReg_64_AlignTarget:$src1, ImmOpType:$imm), (ins VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm))); field dag InsVOPDX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); let InsVOPDX_immX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immX); @@ -474,10 +474,10 @@ def VOP_MADMK_F64 : VOP_MADMK <f64>; // given VT. class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> { RegisterOperand ret = - !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, RegisterOperand<VGPR_32>, RegisterOperand<VGPR_16>), diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 3a0cc35..7cfd059 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -705,16 +705,16 @@ foreach Type = ["U", "I"] in (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand<AGPR_32>; -def ADst_64 : VOPDstOperand<AReg_64>; -def ADst_128 : VOPDstOperand<AReg_128>; -def ADst_256 : VOPDstOperand<AReg_256>; -def ADst_512 : VOPDstOperand<AReg_512>; -def ADst_1024 : VOPDstOperand<AReg_1024>; -def VDst_64 : VOPDstOperand<VReg_64>; -def VDst_128 : VOPDstOperand<VReg_128>; -def VDst_256 : VOPDstOperand<VReg_256>; -def VDst_512 : VOPDstOperand<VReg_512>; -def VDst_1024 : VOPDstOperand<VReg_1024>; +def ADst_64 : VOPDstOperand<AReg_64_AlignTarget>; +def ADst_128 : VOPDstOperand<AReg_128_AlignTarget>; +def ADst_256 : VOPDstOperand<AReg_256_AlignTarget>; +def ADst_512 : VOPDstOperand<AReg_512_AlignTarget>; +def ADst_1024 : VOPDstOperand<AReg_1024_AlignTarget>; +def VDst_64 : VOPDstOperand<VReg_64_AlignTarget>; +def VDst_128 : VOPDstOperand<VReg_128_AlignTarget>; +def VDst_256 : VOPDstOperand<VReg_256_AlignTarget>; +def VDst_512 : VOPDstOperand<VReg_512_AlignTarget>; +def VDst_1024 : VOPDstOperand<VReg_1024_AlignTarget>; def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> { let Src0RC64 = ARegSrc_32; @@ -811,23 +811,23 @@ def 
VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; -def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; - -def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_32X32X32_I8 : 
VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; + +def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>; def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 2e8a676..ce1cdb3 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -232,6 +232,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::SP); markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); + markSuperRegs(Reserved, ARM::FPSCR_RM); markSuperRegs(Reserved, ARM::APSR_NZCV); if (TFI->isFPReserved(MF)) markSuperRegs(Reserved, STI.getFramePointerReg()); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index f4ac6bb..2a40fb9 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 31650e0..6771106 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -435,14 +435,14 @@ def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">; // FP Binary Operations. 
// -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", @@ -453,21 +453,21 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", @@ -478,42 +478,42 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = 
[FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", @@ -524,21 +524,21 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Dn = $Dd" in +let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", @@ -549,7 +549,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0, let D = VFPNeonA8Domain; } -let TwoOperandAliasConstraint = "$Sn = $Sd" in +let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", @@ -589,7 +589,7 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>; multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", - isUnpredicable = 1 in { + isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"), @@ -621,7 +621,7 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b), (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; // These are encoded as unary instructions. -let Defs = [FPSCR_NZCV] in { +let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", "", @@ -684,7 +684,7 @@ def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0, IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm", [(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>; -let Defs = [FPSCR_NZCV] in { +let Defs = [FPSCR_NZCV], mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", "", @@ -742,6 +742,7 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, } } // Defs = [FPSCR_NZCV] +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", @@ -762,6 +763,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, } // Special case encoding: bits 11-8 is 0b1011. +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", [(set SPR:$Sd, (fpround DPR:$Dm))]>, @@ -787,7 +789,7 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, } // Between half, single and double-precision. 
-let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, @@ -799,7 +801,7 @@ def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, @@ -821,7 +823,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", "", [/* Intentionally left blank, see patterns below */]>, @@ -835,7 +837,7 @@ def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; -let hasSideEffects = 0 in +let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", "$Sd = $Sda", [/* Intentionally left blank, see patterns below */]>, @@ -853,6 +855,7 @@ def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", "", @@ -876,6 +879,7 @@ def : FP16Pat<(f64 (f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", @@ -901,6 +905,7 @@ def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), (i32 (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$a), GPR))>, Requires<[HasFPARMv8, HasDPVFP]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", "", @@ -915,6 +920,7 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, let hasSideEffects = 0; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, DPR:$Dm), NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", "$Sd = $Sda", @@ -934,7 +940,8 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, multiclass vcvt_inst<string opc, bits<2> rm, SDPatternOperator node = null_frag> { - let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0, + mayRaiseFPException = 1 in { def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"), @@ -1055,7 +1062,9 @@ def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0, 
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm", [(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>; -multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { +multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node, + list<Register> uses = [], bit fpexc = 0> { + let Uses = uses, mayRaiseFPException = fpexc in { def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm", @@ -1081,6 +1090,7 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { let Inst{7} = op2; let Inst{16} = op; } + } def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"), (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>, @@ -1093,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { @@ -1140,18 +1150,21 @@ defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", @@ -1486,6 +1499,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let hasSideEffects = 0; } +let mayRaiseFPException = 1 in def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", @@ -1502,6 +1516,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSITOD (VLDRS addrmode5:$a))>; } +let mayRaiseFPException = 1 in def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, (outs SPR:$Sd),(ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", @@ -1520,6 +1535,7 @@ def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; +let mayRaiseFPException = 1 in def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm", @@ -1532,6 +1548,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; +let mayRaiseFPException = 1 in def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", @@ -1548,6 +1565,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VUITOD (VLDRS addrmode5:$a))>; } +let mayRaiseFPException = 1 in def VUITOS : AVConv1InSs_Encode<0b11101, 
0b11, 0b1000, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", @@ -1566,6 +1584,7 @@ def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; +let mayRaiseFPException = 1 in def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm", @@ -1640,6 +1659,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, } // Always set Z bit in the instruction, i.e. "round towards zero" variants. +let mayRaiseFPException = 1 in def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", @@ -1660,6 +1680,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; } +let mayRaiseFPException = 1 in def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", @@ -1684,6 +1705,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; +let mayRaiseFPException = 1 in def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm", @@ -1698,6 +1720,7 @@ def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; +let mayRaiseFPException = 1 in def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", @@ -1718,6 +1741,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; } +let mayRaiseFPException = 1 in def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", @@ -1742,6 +1766,7 @@ def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; +let mayRaiseFPException = 1 in def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm", @@ -1757,7 +1782,7 @@ def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. 
-let Uses = [FPSCR] in { +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in { def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm", @@ -1807,7 +1832,7 @@ def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let Inst{7} = 0; // Z bit let isUnpredicable = 1; } -} +} // mayRaiseFPException = 1, Uses = [FPSCR_RM] // v8.3-a Javascript Convert to Signed fixed-point def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011, @@ -1825,7 +1850,7 @@ def VJCVT : AVConv1IsD_Encode<0b11101, 0b11, 0b1001, 0b1011, // S32 (U=0, sx=1) -> SL // U32 (U=1, sx=1) -> UL -let Constraints = "$a = $dst" in { +let Constraints = "$a = $dst", mayRaiseFPException = 1 in { // FP to Fixed-Point: @@ -2026,9 +2051,10 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>, Sched<[WriteFPCVT]>; -} // End of 'let Constraints = "$a = $dst" in' +} // End of 'let Constraints = "$a = $dst", mayRaiseFPException = 1 in' // BFloat16 - Single precision, unary, predicated +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in class BF16_VCVT<string opc, bits<2> op7_6> : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm), VFPUnaryFrm, NoItinerary, @@ -2063,6 +2089,7 @@ def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>; // FP Multiply-Accumulate Operations. // +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAD : ADbI<0b11100, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", @@ -2072,6 +2099,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAS : ASbIn<0b11100, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", @@ -2085,6 +2113,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLAH : AHbI<0b11100, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm", @@ -2104,6 +2133,7 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", @@ -2113,6 +2143,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSS : ASbIn<0b11100, 0b00, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", @@ -2126,6 +2157,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMLSH : AHbI<0b11100, 0b00, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm", @@ -2144,6 +2176,7 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), (VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", @@ -2153,6 +2186,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, 
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAS : ASbI<0b11100, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", @@ -2166,6 +2200,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLAH : AHbI<0b11100, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm", @@ -2196,6 +2231,7 @@ def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)), (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", @@ -2205,6 +2241,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSS : ASbI<0b11100, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", @@ -2217,6 +2254,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, let D = VFPNeonA8Domain; } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMLSH : AHbI<0b11100, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", @@ -2237,6 +2275,7 @@ def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin), //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. // +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAD : ADbI<0b11101, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm", @@ -2246,6 +2285,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAS : ASbIn<0b11101, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm", @@ -2258,6 +2298,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMAH : AHbI<0b11101, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm", @@ -2289,6 +2330,7 @@ def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSD : ADbI<0b11101, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm", @@ -2298,6 +2340,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSS : ASbIn<0b11101, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm", @@ -2310,6 +2353,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, // VFP pipelines. 
} +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFMSH : AHbI<0b11101, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm", @@ -2341,6 +2385,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAD : ADbI<0b11101, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm", @@ -2350,6 +2395,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAS : ASbI<0b11101, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm", @@ -2362,6 +2408,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMAH : AHbI<0b11101, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm", @@ -2400,6 +2447,7 @@ def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin))) (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSD : ADbI<0b11101, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm", @@ -2409,6 +2457,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0, Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSS : ASbI<0b11101, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", @@ -2420,6 +2469,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0, // VFP pipelines. } +let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VFNMSH : AHbI<0b11101, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm", diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 5a31b88..de42195 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -177,8 +177,9 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; } // Current Program Status Register. -// We model fpscr with two registers: FPSCR models the control bits and will be -// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV +// We model fpscr with three registers. FPSCR models the control bits and will be +// reserved. FPSCR_RM models rounding mode control bits and will be reserved. +// FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV // models the APSR when it's accessed by some special instructions. In such cases // it has the same encoding as PC. def CPSR : ARMReg<0, "cpsr">; @@ -189,6 +190,9 @@ def FPSCR : ARMReg<3, "fpscr">; def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> { let Aliases = [FPSCR]; } +def FPSCR_RM : ARMReg<3, "fpscr_rm"> { + let Aliases = [FPSCR]; +} def ITSTATE : ARMReg<4, "itstate">; // Special Registers - only available in privileged mode. 
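A minimal host-side C++ sketch, not taken from the patch, of the dependency the new Uses = [FPSCR_RM] / mayRaiseFPException = 1 annotations in the ARMInstrVFP.td hunks above describe: the same fadd yields different results (and can raise the inexact flag) under different dynamic rounding modes, so once the rounding-mode bits are modelled as a separate reserved register these instructions can no longer be moved across writes to it. Assumes a host toolchain with C99 fesetround support; nothing below is LLVM code.

// Host-only illustration of rounding-mode dependence (assumed host support
// for FE_TONEAREST / FE_UPWARD from <cfenv>).
#include <cfenv>
#include <cstdio>

int main() {
  volatile float a = 1.0f;
  volatile float b = 0x1.0p-25f;    // less than half a ULP of 1.0f
  std::fesetround(FE_TONEAREST);
  volatile float r1 = a + b;        // rounds back down to 1.0f
  std::fesetround(FE_UPWARD);
  volatile float r2 = a + b;        // rounds up to the float just above 1.0f
  std::printf("%a %a\n", (double)r1, (double)r2);   // two different results
  return 0;
}

The same reasoning is visible in the vrint hunks above: VRINTX is marked mayRaiseFPException while VRINTZ is not, since only the former signals inexact, and VRINTR/VRINTX read FPSCR_RM because they honour the dynamic rounding mode.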
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp index b202b202..e3c39a1 100644 --- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp +++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsBPF.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -478,9 +479,95 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I, } } +static Value *wrapPtrIfASNotZero(DenseMap<Value *, Value *> &Cache, + CallInst *CI, Value *P) { + if (auto *PTy = dyn_cast<PointerType>(P->getType())) { + if (PTy->getAddressSpace() == 0) + return P; + } + return aspaceWrapValue(Cache, CI->getFunction(), P); +} + +static Instruction *aspaceMemSet(Intrinsic::ID ID, + DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + if (OldDst == NewDst) + return nullptr; + + // memset(new_dst, val, len, align, isvolatile, md) + Value *Val = CI->getArgOperand(1); + Value *Len = CI->getArgOperand(2); + + auto *MS = cast<MemSetInst>(CI); + MaybeAlign Align = MS->getDestAlign(); + bool IsVolatile = MS->isVolatile(); + + if (ID == Intrinsic::memset) + return B.CreateMemSet(NewDst, Val, Len, Align, IsVolatile, + MI->getAAMetadata()); + else + return B.CreateMemSetInline(NewDst, Align, Val, Len, IsVolatile, + MI->getAAMetadata()); +} + +static Instruction *aspaceMemCpy(Intrinsic::ID ID, + DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *OldSrc = CI->getArgOperand(1); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc); + if (OldDst == NewDst && OldSrc == NewSrc) + return nullptr; + + // memcpy(new_dst, dst_align, new_src, src_align, len, isvolatile, md) + Value *Len = CI->getArgOperand(2); + + auto *MT = cast<MemTransferInst>(CI); + MaybeAlign DstAlign = MT->getDestAlign(); + MaybeAlign SrcAlign = MT->getSourceAlign(); + bool IsVolatile = MT->isVolatile(); + + return B.CreateMemTransferInst(ID, NewDst, DstAlign, NewSrc, SrcAlign, Len, + IsVolatile, MI->getAAMetadata()); +} + +static Instruction *aspaceMemMove(DenseMap<Value *, Value *> &Cache, + CallInst *CI) { + auto *MI = cast<MemIntrinsic>(CI); + IRBuilder<> B(CI); + + Value *OldDst = CI->getArgOperand(0); + Value *OldSrc = CI->getArgOperand(1); + Value *NewDst = wrapPtrIfASNotZero(Cache, CI, OldDst); + Value *NewSrc = wrapPtrIfASNotZero(Cache, CI, OldSrc); + if (OldDst == NewDst && OldSrc == NewSrc) + return nullptr; + + // memmove(new_dst, dst_align, new_src, src_align, len, isvolatile, md) + Value *Len = CI->getArgOperand(2); + + auto *MT = cast<MemTransferInst>(CI); + MaybeAlign DstAlign = MT->getDestAlign(); + MaybeAlign SrcAlign = MT->getSourceAlign(); + bool IsVolatile = MT->isVolatile(); + + return B.CreateMemMove(NewDst, DstAlign, NewSrc, SrcAlign, Len, IsVolatile, + MI->getAAMetadata()); +} + // Support for BPF address spaces: // - for each function in the module M, update pointer operand of // each memory access instruction (load/store/cmpxchg/atomicrmw) +// or intrinsic call insns (memset/memcpy/memmove) // by casting it from non-zero address space to zero address space, e.g: // // (load (ptr 
addrspace (N) %p) ...) @@ -493,21 +580,60 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) { for (Function &F : M) { DenseMap<Value *, Value *> CastsCache; for (BasicBlock &BB : F) { - for (Instruction &I : BB) { + for (Instruction &I : llvm::make_early_inc_range(BB)) { unsigned PtrOpNum; - if (auto *LD = dyn_cast<LoadInst>(&I)) + if (auto *LD = dyn_cast<LoadInst>(&I)) { PtrOpNum = LD->getPointerOperandIndex(); - else if (auto *ST = dyn_cast<StoreInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *ST = dyn_cast<StoreInst>(&I)) { PtrOpNum = ST->getPointerOperandIndex(); - else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) { PtrOpNum = CmpXchg->getPointerOperandIndex(); - else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) { PtrOpNum = RMW->getPointerOperandIndex(); + aspaceWrapOperand(CastsCache, &I, PtrOpNum); + continue; + } + + auto *CI = dyn_cast<CallInst>(&I); + if (!CI) + continue; + + Function *Callee = CI->getCalledFunction(); + if (!Callee || !Callee->isIntrinsic()) + continue; + + // Check memset/memcpy/memmove + Intrinsic::ID ID = Callee->getIntrinsicID(); + bool IsSet = ID == Intrinsic::memset || ID == Intrinsic::memset_inline; + bool IsCpy = ID == Intrinsic::memcpy || ID == Intrinsic::memcpy_inline; + bool IsMove = ID == Intrinsic::memmove; + if (!IsSet && !IsCpy && !IsMove) + continue; + + Instruction *New; + if (IsSet) + New = aspaceMemSet(ID, CastsCache, CI); + else if (IsCpy) + New = aspaceMemCpy(ID, CastsCache, CI); else + New = aspaceMemMove(CastsCache, CI); + + if (!New) continue; - aspaceWrapOperand(CastsCache, &I, PtrOpNum); + I.replaceAllUsesWith(New); + New->takeName(&I); + I.eraseFromParent(); } } Changed |= !CastsCache.empty(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 71c21e4..53633ea 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -675,6 +675,45 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC, CC = getRISCVCCFromICmp(Pred); } +/// Select the RISC-V Zalasr opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. +static unsigned selectZalasrLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + default: + llvm_unreachable("Unexpected memory size"); + case 8: + return IsStore ? RISCV::SB_RL : RISCV::LB_AQ; + case 16: + return IsStore ? RISCV::SH_RL : RISCV::LH_AQ; + case 32: + return IsStore ? RISCV::SW_RL : RISCV::LW_AQ; + case 64: + return IsStore ? RISCV::SD_RL : RISCV::LD_AQ; + } +} + +/// Select the RISC-V regimm opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. \returns \p GenericOpc if the combination is unsupported. +static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + case 8: + // Prefer unsigned due to no c.lb in Zcb. + return IsStore ? RISCV::SB : RISCV::LBU; + case 16: + return IsStore ? 
RISCV::SH : RISCV::LH; + case 32: + return IsStore ? RISCV::SW : RISCV::LW; + case 64: + return IsStore ? RISCV::SD : RISCV::LD; + } + + return GenericOpc; +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -736,6 +775,62 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { MI.eraseFromParent(); return true; } + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_SEXT: { + bool IsSigned = Opc != TargetOpcode::G_ZEXT; + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI->getType(SrcReg); + unsigned SrcSize = SrcTy.getSizeInBits(); + + if (SrcTy.isVector()) + return false; // Should be handled by imported patterns. + + assert((*RBI.getRegBank(DstReg, *MRI, TRI)).getID() == + RISCV::GPRBRegBankID && + "Unexpected ext regbank"); + + // Use addiw SrcReg, 0 (sext.w) for i32. + if (IsSigned && SrcSize == 32) { + MI.setDesc(TII.get(RISCV::ADDIW)); + MI.addOperand(MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use add.uw SrcReg, X0 (zext.w) for i32 with Zba. + if (!IsSigned && SrcSize == 32 && STI.hasStdExtZba()) { + MI.setDesc(TII.get(RISCV::ADD_UW)); + MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use sext.h/zext.h for i16 with Zbb. + if (SrcSize == 16 && STI.hasStdExtZbb()) { + MI.setDesc(TII.get(IsSigned ? RISCV::SEXT_H + : STI.isRV64() ? RISCV::ZEXT_H_RV64 + : RISCV::ZEXT_H_RV32)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Use pack(w) SrcReg, X0 for i16 zext with Zbkb. + if (!IsSigned && SrcSize == 16 && STI.hasStdExtZbkb()) { + MI.setDesc(TII.get(STI.is64Bit() ? RISCV::PACKW : RISCV::PACK)); + MI.addOperand(MachineOperand::CreateReg(RISCV::X0, /*isDef=*/false)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + // Fall back to shift pair. + auto ShiftLeft = + MIB.buildInstr(RISCV::SLLI, {&RISCV::GPRRegClass}, {SrcReg}) + .addImm(STI.getXLen() - SrcSize); + constrainSelectedInstRegOperands(*ShiftLeft, TII, TRI, RBI); + auto ShiftRight = MIB.buildInstr(IsSigned ? RISCV::SRAI : RISCV::SRLI, + {DstReg}, {ShiftLeft}) + .addImm(STI.getXLen() - SrcSize); + constrainSelectedInstRegOperands(*ShiftRight, TII, TRI, RBI); + MI.eraseFromParent(); + return true; + } case TargetOpcode::G_FCONSTANT: { // TODO: Use constant pool for complex constants. Register DstReg = MI.getOperand(0).getReg(); @@ -836,6 +931,59 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return selectImplicitDef(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + GLoadStore &LdSt = cast<GLoadStore>(MI); + const Register ValReg = LdSt.getReg(0); + const Register PtrReg = LdSt.getPointerReg(); + LLT PtrTy = MRI->getType(PtrReg); + + const RegisterBank &RB = *RBI.getRegBank(ValReg, *MRI, TRI); + if (RB.getID() != RISCV::GPRBRegBankID) + return false; + +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, *MRI, TRI); + // Check that the pointer register is valid. + assert(PtrRB.getID() == RISCV::GPRBRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(PtrTy.isPointer() && "Load/Store pointer operand isn't a pointer"); +#endif + + // Can only handle AddressSpace 0. 
+ if (PtrTy.getAddressSpace() != 0) + return false; + + unsigned MemSize = LdSt.getMemSizeInBits().getValue(); + AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); + + if (isStrongerThanMonotonic(Order)) { + MI.setDesc(TII.get(selectZalasrLoadStoreOp(Opc, MemSize))); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + const unsigned NewOpc = selectRegImmLoadStoreOp(MI.getOpcode(), MemSize); + if (NewOpc == MI.getOpcode()) + return false; + + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = selectAddrRegImm(MI.getOperand(1)); + if (!AddrModeFns) + return false; + + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, MI.getFlags()); + if (isa<GStore>(MI)) + NewInst.addUse(ValReg); + else + NewInst.addDef(ValReg); + NewInst.cloneMemRefs(MI); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); + } default: return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index a537904..5dd4bf4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -166,7 +166,7 @@ static unsigned getLRForRMW32(AtomicOrdering Ordering, return RISCV::LR_W; return RISCV::LR_W_AQ; case AtomicOrdering::SequentiallyConsistent: - return RISCV::LR_W_AQ_RL; + return RISCV::LR_W_AQRL; } } @@ -210,7 +210,7 @@ static unsigned getLRForRMW64(AtomicOrdering Ordering, return RISCV::LR_D; return RISCV::LR_D_AQ; case AtomicOrdering::SequentiallyConsistent: - return RISCV::LR_D_AQ_RL; + return RISCV::LR_D_AQRL; } } @@ -287,8 +287,8 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -375,8 +375,8 @@ static void doMaskedAtomicBinOpExpansion(const RISCVInstrInfo *TII, ScratchReg); BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -535,8 +535,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( // sc.w scratch1, scratch1, (addr) // bnez scratch1, loop BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), Scratch1Reg) - .addReg(AddrReg) - .addReg(Scratch1Reg); + .addReg(Scratch1Reg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(Scratch1Reg) .addReg(RISCV::X0) @@ -674,8 +674,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( // bnez scratch, loophead BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(NewValReg); + .addReg(NewValReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -707,8 +707,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( MaskReg, ScratchReg); BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) diff --git 
a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 27cf057..40c05e8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -265,7 +265,7 @@ def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">; def FeatureStdExtZalasr - : RISCVExperimentalExtension<0, 1, "Load-Acquire and Store-Release Instructions">; + : RISCVExperimentalExtension<0, 9, "Load-Acquire and Store-Release Instructions">; def HasStdExtZalasr : Predicate<"Subtarget->hasStdExtZalasr()">, AssemblerPredicate<(all_of FeatureStdExtZalasr), "'Zalasr' (Load-Acquire and Store-Release Instructions)">; diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 6d01250..eba35ef 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -100,119 +100,11 @@ def : LdPat<load, LD, PtrVT>; def : StPat<store, SD, GPR, PtrVT>; } -// Load and store patterns for i16, needed because Zfh makes s16 load/store -// legal and regbank select may not constrain registers to FP. -def : LdPat<load, LH, i16>; -def : StPat<store, SH, GPR, i16>; - -def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. -def : StPat<truncstorei8, SB, GPR, i16>; - -let Predicates = [HasAtomicLdSt] in { - // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; - def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - - def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; - def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; -} - -let Predicates = [HasAtomicLdSt, IsRV64] in { - // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; -} - //===----------------------------------------------------------------------===// // RV64 i32 patterns not used by SelectionDAG //===----------------------------------------------------------------------===// let Predicates = [IsRV64] in { -def : LdPat<extloadi8, LBU, i32>; // Prefer unsigned due to no c.lb in Zcb. -def : LdPat<extloadi16, LH, i32>; - -def : StPat<truncstorei8, SB, GPR, i32>; -def : StPat<truncstorei16, SH, GPR, i32>; - -def : Pat<(sext (i32 GPR:$src)), (ADDIW GPR:$src, 0)>; - def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32), (ADDIW GPR:$rs1, simm12_lo:$imm)>; } - -let Predicates = [IsRV64, NoStdExtZba] in -def : Pat<(zext (i32 GPR:$src)), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>; - -let Predicates = [IsRV32, NoStdExtZbb, NoStdExtZbkb] in -def : Pat<(XLenVT (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 16)), 16)>; - -let Predicates = [IsRV64, NoStdExtZbb, NoStdExtZbkb] in { -def : Pat<(i64 (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>; -def : Pat<(i32 (zext (i16 GPR:$src))), - (SRLI (XLenVT (SLLI GPR:$src, 48)), 48)>; -} - -let Predicates = [IsRV32, NoStdExtZbb] in -def : Pat<(XLenVT (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 16)), 16)>; - -let Predicates = [IsRV64, NoStdExtZbb] in { -def : Pat<(i64 (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>; -def : Pat<(i32 (sext (i16 GPR:$src))), - (SRAI (XLenVT (SLLI GPR:$src, 48)), 48)>; -} - -//===----------------------------------------------------------------------===// -// Zb* RV64 patterns not used by SelectionDAG. 
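On the repeated "Prefer unsigned due to no c.lb in Zcb" comments above: an anyext i8 load only guarantees the low eight bits, so either lb or lbu is a legal selection, and choosing LBU keeps a compressed encoding available (Zcb provides c.lbu but no c.lb). A tiny standalone illustration of why the two forms are interchangeable for anyext uses; the value is arbitrary and the variable names are mine:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Mem = 0x90;                        // byte in memory
  int64_t ViaLB = static_cast<int8_t>(Mem);  // lb : sign-fills the upper bits
  int64_t ViaLBU = Mem;                      // lbu: zero-fills the upper bits
  // An anyext load only promises the low 8 bits; both forms agree there.
  assert((ViaLB & 0xFF) == (ViaLBU & 0xFF));
  return 0;
}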
-//===----------------------------------------------------------------------===// - -let Predicates = [HasStdExtZba, IsRV64] in { -def : Pat<(zext (i32 GPR:$src)), (ADD_UW GPR:$src, (XLenVT X0))>; -} - -let Predicates = [HasStdExtZbb] in -def : Pat<(i32 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>; -let Predicates = [HasStdExtZbb, IsRV64] in -def : Pat<(i64 (sext (i16 GPR:$rs))), (SEXT_H GPR:$rs)>; - -let Predicates = [HasStdExtZbb, IsRV32] in -def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV32 GPR:$rs)>; -let Predicates = [HasStdExtZbb, IsRV64] in { -def : Pat<(i64 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>; -def : Pat<(i32 (zext (i16 GPR:$rs))), (ZEXT_H_RV64 GPR:$rs)>; -} - -let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV32] in -def : Pat<(i32 (zext (i16 GPR:$rs))), (PACK GPR:$rs, (XLenVT X0))>; -let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV64] in { -def : Pat<(i64 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; -def : Pat<(i32 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; -} - -//===----------------------------------------------------------------------===// -// Zalasr patterns not used by SelectionDAG -//===----------------------------------------------------------------------===// - -let Predicates = [HasStdExtZalasr] in { - // the sequentially consistent loads use - // .aq instead of .aqrl to match the psABI/A.7 - def : PatLAQ<acquiring_load<atomic_load_aext_8>, LB_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_aext_8>, LB_AQ, i16>; - - def : PatLAQ<acquiring_load<atomic_load_nonext_16>, LH_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_nonext_16>, LH_AQ, i16>; - - def : PatSRL<releasing_store<atomic_store_8>, SB_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_8>, SB_RL, i16>; - - def : PatSRL<releasing_store<atomic_store_16>, SH_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_16>, SH_RL, i16>; -} - -let Predicates = [HasStdExtZalasr, IsRV64] in { - // Load pattern is in RISCVInstrInfoZalasr.td and shared with RV32. - def : PatSRL<releasing_store<atomic_store_32>, SW_RL, i32>; - def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL, i32>; -} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a3a4cf2..7123a2d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15721,8 +15721,7 @@ static SDValue combineAddOfBooleanXor(SDNode *N, SelectionDAG &DAG) { return SDValue(); // Emit a negate of the setcc. - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - N0.getOperand(0)); + return DAG.getNegative(N0.getOperand(0), DL, VT); } static SDValue performADDCombine(SDNode *N, @@ -16974,7 +16973,7 @@ performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc)) if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG()) - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + return DAG.getNegative(Src, DL, VT); // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1) if (Opc == ISD::XOR && SrcVT == MVT::i1 && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9855c47..7a14929 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1980,7 +1980,7 @@ def : LdPat<sextloadi8, LB>; def : LdPat<extloadi8, LBU>; // Prefer unsigned due to no c.lb in Zcb. 
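Regarding the switch to DAG.getNegative above: sign-extending an i1 boolean b to a wider type yields 0 or all-ones, which is exactly 0 - b, so the setcc folds reduce to a single negate. A small standalone check of that identity, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {0, 1};
  for (int64_t B : Vals) {
    // Sign-extend the low bit of B to 64 bits via the shift idiom.
    int64_t SextI1 =
        static_cast<int64_t>(static_cast<uint64_t>(B) << 63) >> 63;
    // sext_inreg(b, i1) equals 0 - b for a 0/1 boolean, hence the negate.
    assert(SextI1 == -B);
  }
  return 0;
}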
def : LdPat<sextloadi16, LH>; def : LdPat<extloadi16, LH>; -def : LdPat<load, LW, i32>; +def : LdPat<load, LW, i32>, Requires<[IsRV32]>; def : LdPat<zextloadi8, LBU>; def : LdPat<zextloadi16, LHU>; @@ -1994,7 +1994,7 @@ class StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy, def : StPat<truncstorei8, SB, GPR, XLenVT>; def : StPat<truncstorei16, SH, GPR, XLenVT>; -def : StPat<store, SW, GPR, i32>; +def : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>; /// Fences diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 25accd9..571d72f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -24,36 +24,36 @@ class LR_r<bit aq, bit rl, bits<3> funct3, string opcodestr> } multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> { - def "" : LR_r<0, 0, funct3, opcodestr>; - def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">; - def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : LR_r<1, 1, funct3, opcodestr # ".aqrl">; + def "" : LR_r<0, 0, funct3, opcodestr>; + def _AQ : LR_r<1, 0, funct3, opcodestr # ".aq">; + def _RL : LR_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : LR_r<1, 1, funct3, opcodestr # ".aqrl">; } let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class SC_r<bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<0b00011, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { - def "" : SC_r<0, 0, funct3, opcodestr>; - def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">; - def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : SC_r<1, 1, funct3, opcodestr # ".aqrl">; + def "" : SC_r<0, 0, funct3, opcodestr>; + def _AQ : SC_r<1, 0, funct3, opcodestr # ".aq">; + def _RL : SC_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : SC_r<1, 1, funct3, opcodestr # ".aqrl">; } let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> { - def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>; - def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">; - def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">; + def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>; + def _AQ : AMO_rr<funct5, 1, 0, funct3, opcodestr # ".aq">; + def _RL : AMO_rr<funct5, 0, 1, funct3, opcodestr # ".rl">; + def _AQRL : AMO_rr<funct5, 1, 1, funct3, opcodestr # ".aqrl">; } //===----------------------------------------------------------------------===// @@ -174,8 +174,9 @@ let Predicates = [HasAtomicLdSt] in { def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasAtomicLdSt, IsRV32] in { def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>; } @@ -188,31 +189,34 @@ let Predicates = [HasAtomicLdSt, IsRV64] in { /// AMOs +class PatAMO<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT> + : Pat<(vt (OpNode (XLenVT GPR:$rs1), (vt GPR:$rs2))), (Inst GPR:$rs2, GPR:$rs1)>; + multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtA, NoStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst#"_AQ"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst#"_RL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst#"_AQ_RL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst#"_AQ_RL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst#"_AQ"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst#"_RL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; } let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst), vt>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td index 7cf6d5f..20e2142 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -9,8 +9,8 @@ // This file describes the RISC-V instructions from the standard atomic 'Za*' // extensions: // - Zawrs (v1.0) : Wait-on-Reservation-Set. -// - Zacas (v1.0-rc1) : Atomic Compare-and-Swap. -// - Zabha (v1.0-rc1) : Byte and Halfword Atomic Memory Operations. +// - Zacas (v1.0) : Atomic Compare-and-Swap. +// - Zabha (v1.0) : Byte and Halfword Atomic Memory Operations. 
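For context on the _AQ_RL -> _AQRL renaming and the PatAMO patterns above: in the non-Ztso configuration each atomic ordering selects an AMO suffix, with acq_rel and seq_cst both mapping to the .aqrl form, while under Ztso every ordering selects the base instruction. A standalone sketch of that mapping; the enum and function names are illustrative, not LLVM's:

#include <cstdio>

enum class Ordering { Monotonic, Acquire, Release, AcqRel, SeqCst };

// Suffix chosen by the AMOPat multiclass above in the non-Ztso case.
const char *amoSuffix(Ordering O) {
  switch (O) {
  case Ordering::Monotonic: return "";       // base def, e.g. amoadd.w
  case Ordering::Acquire:   return ".aq";    // _AQ
  case Ordering::Release:   return ".rl";    // _RL
  case Ordering::AcqRel:
  case Ordering::SeqCst:    return ".aqrl";  // _AQRL covers both orderings
  }
  return "";
}

int main() {
  std::printf("amoadd.w%s\n", amoSuffix(Ordering::SeqCst)); // amoadd.w.aqrl
  return 0;
}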
// //===----------------------------------------------------------------------===// @@ -44,15 +44,15 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "$rd = $rd_wb" class AMO_cas<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr, DAGOperand RC> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs RC:$rd_wb), (ins RC:$rd, GPRMemZeroOffset:$rs1, RC:$rs2), + (outs RC:$rd_wb), (ins RC:$rd, RC:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_cas_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr, DAGOperand RC> { - def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>; - def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>; - def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>; - def _AQ_RL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>; + def "" : AMO_cas<funct5, 0, 0, funct3, opcodestr, RC>; + def _AQ : AMO_cas<funct5, 1, 0, funct3, opcodestr # ".aq", RC>; + def _RL : AMO_cas<funct5, 0, 1, funct3, opcodestr # ".rl", RC>; + def _AQRL : AMO_cas<funct5, 1, 1, funct3, opcodestr # ".aqrl", RC>; } let Predicates = [HasStdExtZacas], IsSignExtendingOpW = 1 in { @@ -71,48 +71,48 @@ defm AMOCAS_Q : AMO_cas_aq_rl<0b00101, 0b100, "amocas.q", GPRPairRV64>; multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) let Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : 
Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) } @@ -140,7 +140,7 @@ def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>; // Zabha (Byte and Halfword Atomic Memory Operations) //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZabha] in { +let Predicates = [HasStdExtZabha], IsSignExtendingOpW = 1 in { defm AMOSWAP_B : AMO_rr_aq_rl<0b00001, 0b000, "amoswap.b">, Sched<[WriteAtomicB, ReadAtomicBA, ReadAtomicBD]>; defm AMOADD_B : AMO_rr_aq_rl<0b00000, 0b000, "amoadd.b">, @@ -181,7 +181,7 @@ defm AMOMAXU_H : AMO_rr_aq_rl<0b11100, 0b001, "amomaxu.h">, } // If Zacas extension is also implemented, Zabha further provides AMOCAS.[B|H]. -let Predicates = [HasStdExtZabha, HasStdExtZacas] in { +let Predicates = [HasStdExtZabha, HasStdExtZacas], IsSignExtendingOpW = 1 in { defm AMOCAS_B : AMO_cas_aq_rl<0b00101, 0b000, "amocas.b", GPR>; defm AMOCAS_H : AMO_cas_aq_rl<0b00101, 0b001, "amocas.h", GPR>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td index 1deecd2..5f944034 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td @@ -30,21 +30,22 @@ class SRL_r<bit aq, bit rl, bits<3> funct3, string opcodestr> opcodestr, "$rs2, $rs1"> { let rd = 0; } + multiclass LAQ_r_aq_rl<bits<3> funct3, string opcodestr> { - def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">; - def _AQ_RL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">; + def _AQ : LAQ_r<1, 0, funct3, opcodestr # ".aq">; + def _AQRL : LAQ_r<1, 1, funct3, opcodestr # ".aqrl">; } multiclass SRL_r_aq_rl<bits<3> funct3, string opcodestr> { - def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">; - def _AQ_RL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">; + def _RL : SRL_r<0, 1, funct3, opcodestr # ".rl">; + def _AQRL : SRL_r<1, 1, funct3, opcodestr # ".aqrl">; } //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZalasr] in { +let Predicates = [HasStdExtZalasr], IsSignExtendingOpW = 1 in { defm LB : LAQ_r_aq_rl<0b000, "lb">; defm LH : LAQ_r_aq_rl<0b001, "lh">; defm LW : LAQ_r_aq_rl<0b010, "lw">; @@ -93,11 +94,12 @@ let Predicates = [HasStdExtZalasr] in { def : PatSRL<releasing_store<atomic_store_32>, SW_RL>; def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasStdExtZalasr, IsRV32] in { def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>; def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>; -} // Predicates = [HasStdExtZalasr] +} // Predicates = [HasStdExtZalasr, IsRV32] let Predicates = [HasStdExtZalasr, IsRV64] in { def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ, i64>; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3f2e7db..3e07eff 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -567,9 +567,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX8", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX8", [VCQ, VL], mx, IsWorstCase>; @@ -587,9 +590,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX16", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX16", [VCQ, VL], mx, IsWorstCase>; @@ -604,9 +610,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX32", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX32", [VCQ, VL], mx, IsWorstCase>; @@ -621,9 +630,12 @@ multiclass SiFive7WriteResBase<int VLEN, defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c; defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, VLEN>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c; - defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, [VCQ, VL], - 4, [0, 1], [1, 
!add(1, VLDSX0Cycles)], !add(3, Cycles), - [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>; + defm : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, + // Predicated + [VCQ, VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], + // Not Predicated + [VCQ, VL], !add(3, Cycles), [0, 1], [1, !add(1, Cycles)], + mx, IsWorstCase>; let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVLDUX64", [VCQ, VL], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVLDOX64", [VCQ, VL], mx, IsWorstCase>; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 6c7658c..01a4308 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -67,42 +67,41 @@ multiclass LMULSEWWriteResMXSEW<string name, list<ProcResourceKind> resources, // ReleaseAtCycles predCycles if the SchedPredicate Pred is true, otherwise has // Latency noPredLat and ReleaseAtCycles noPredCycles. The WorstCase SchedWrite // is created similarly if IsWorstCase is true. -multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, - list<ProcResourceKind> resources, - int predLat, list<int> predAcquireCycles, - list<int> predReleaseCycles, int noPredLat, - list<int> noPredAcquireCycles, - list<int> noPredReleaseCycles, - string mx, bit IsWorstCase> { - defvar nameMX = name # "_" # mx; - +multiclass LMULWriteResVariantImpl<string name, string writeResName, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + bit IsWorstCase> { // Define the different behaviors - def nameMX # "_Pred" : SchedWriteRes<resources>{ + def writeResName # "_Pred" : SchedWriteRes<predResources>{ let Latency = predLat; let AcquireAtCycles = predAcquireCycles; let ReleaseAtCycles = predReleaseCycles; } - def nameMX # "_NoPred" : SchedWriteRes<resources> { + def writeResName # "_NoPred" : SchedWriteRes<noPredResources> { let Latency = noPredLat; let AcquireAtCycles = noPredAcquireCycles; let ReleaseAtCycles = noPredReleaseCycles; } // Define SchedVars - def nameMX # PredSchedVar - : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>; - def nameMX # NoPredSchedVar - : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>; + def writeResName # PredSchedVar + : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # writeResName # "_Pred")]>; + def writeResName # NoPredSchedVar + : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # writeResName #"_NoPred")]>; // Allow multiclass to refer to SchedVars -- need to have NAME prefix. 
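On the LMULWriteResVariantImpl/LMULWriteResMXVariant split above, which lets the predicated and non-predicated SchedWriteRes carry separate resource lists: the SiFive7 strided-load users pick a fixed latency when VLDSX0Pred holds (presumably the zero-stride case) and a base-plus-per-element latency otherwise. A rough standalone sketch with the constants copied from the WriteVLDS* entries; the element-cycle count is a stand-in for SiFive7GetCyclesOnePerElement and the function name is mine:

#include <cstdio>

// Rough model of the latency split encoded by the WriteVLDS* entries above:
// a fixed latency when VLDSX0Pred holds, otherwise a base latency plus one
// cycle per element.
unsigned sifive7StridedLoadLatency(bool PredHolds, unsigned ElementCycles) {
  const unsigned PredLatency = 4; // predicated Latency in the entries above
  const unsigned BaseLatency = 3; // the !add(3, Cycles) term above
  return PredHolds ? PredLatency : BaseLatency + ElementCycles;
}

int main() {
  std::printf("%u vs %u\n", sifive7StridedLoadLatency(true, 16),
              sifive7StridedLoadLatency(false, 16)); // prints "4 vs 19"
  return 0;
}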
- defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar); - defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar); + defvar PredSchedVar = !cast<SchedVar>(NAME # writeResName # PredSchedVar); + defvar NoPredSchedVar = !cast<SchedVar>(NAME # writeResName # NoPredSchedVar); // Tie behavior to predicate - def NAME # nameMX # "_Variant" + def NAME # writeResName # "_Variant" : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< - !cast<SchedReadWrite>(nameMX), - !cast<SchedReadWrite>(NAME # nameMX # "_Variant")>; + !cast<SchedReadWrite>(writeResName), + !cast<SchedReadWrite>(NAME # writeResName # "_Variant")>; if IsWorstCase then { def NAME # name # "_WorstCase_Variant" @@ -113,6 +112,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, } } +multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. Each multiclass is responsible for defining diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index fc14a03..f7be2a1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -32,7 +32,9 @@ class SPIRVLegalizeImplicitBinding : public ModulePass { public: static char ID; SPIRVLegalizeImplicitBinding() : ModulePass(ID) {} - + StringRef getPassName() const override { + return "SPIRV Legalize Implicit Binding"; + } bool runOnModule(Module &M) override; private: diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 8e7e2e5..f1d487c87 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -326,6 +326,15 @@ void SparcAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) { Sparc_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + emitInstruction(&*I); + ++I; + } + return; + } switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index a160709..cbb7db6 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -3557,3 +3558,28 @@ void SparcTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if 
(!Node->hasAnyUseOfValue(0)) MI.getOperand(0).setReg(SP::G0); } + +Instruction *SparcTargetLowering::emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + bool HasStoreSemantics = + isa<AtomicCmpXchgInst, AtomicRMWInst, StoreInst>(Inst); + if (HasStoreSemantics && isReleaseOrStronger(Ord)) + return Builder.CreateFence(AtomicOrdering::Release); + return nullptr; +} + +Instruction *SparcTargetLowering::emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + // V8 loads already come with implicit acquire barrier so there's no need to + // emit it again. + bool HasLoadSemantics = isa<AtomicCmpXchgInst, AtomicRMWInst, LoadInst>(Inst); + if (Subtarget->isV9() && HasLoadSemantics && isAcquireOrStronger(Ord)) + return Builder.CreateFence(AtomicOrdering::Acquire); + + // SC plain stores would need a trailing full barrier. + if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) + return Builder.CreateFence(Ord); + return nullptr; +} diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index e7040f7..f3efd94 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -183,6 +183,11 @@ namespace llvm { bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { // FIXME: We insert fences for each atomics and generate // sub-optimal code for PSO/TSO. (Approximately nobody uses any diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index e28f445..f66eb9d 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -653,6 +653,23 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addImm(Offset); return true; } + case SP::V8BAR: { + assert(!Subtarget.isV9() && + "V8BAR should not be emitted on V9 processors!"); + + // Emit stbar; ldstub [%sp-1], %g0 + // The sequence acts as a full barrier on V8 systems. + MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr &InstSTBAR = + *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::STBAR)); + MachineInstr &InstLDSTUB = + *BuildMI(MBB, MI, MI.getDebugLoc(), get(SP::LDSTUBri), SP::G0) + .addReg(SP::O6) + .addImm(-1); + MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB); + MBB.erase(MI); + return true; + } } return false; } diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 97e7fd7..bc192c2 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -578,6 +578,9 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> let isPseudo = 1; } +// Full memory barrier for V8. +def V8BAR : Pseudo<(outs), (ins), "!V8BAR", []>, Requires<[HasNoV9]>; + // GETPCX for PIC let Defs = [O7] in { def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >; @@ -1974,12 +1977,30 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>; def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>; def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>; -// store bar for all atomic_fence in V8. 
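As an illustration of what the Sparc emitLeadingFence/emitTrailingFence hooks above arrange for a sequentially consistent store, here is a sketch against std::atomic rather than the actual lowering; it only shows the fence placement, not the final machine code:

#include <atomic>

// Roughly the shape produced by the hooks above for a seq_cst store:
// a release fence before the plain store, and a full fence after it.
void seqCstStoreShape(std::atomic<int> &X, int V) {
  std::atomic_thread_fence(std::memory_order_release); // emitLeadingFence
  X.store(V, std::memory_order_relaxed);               // plain st
  std::atomic_thread_fence(std::memory_order_seq_cst); // emitTrailingFence
}

int main() {
  std::atomic<int> A{0};
  seqCstStoreShape(A, 42);
  return 0;
}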
-let Predicates = [HasNoV9] in - def : Pat<(atomic_fence timm, timm), (STBAR)>; +// All load-type operations in V8 comes with implicit acquire semantics. +let Predicates = [HasNoV9] in { + // Acquire -> nop + def : Pat<(atomic_fence (i32 4), timm), (NOP)>; + // Release / AcqRel -> stbar + def : Pat<(atomic_fence (i32 5), timm), (STBAR)>; + // AcqRel and stronger -> stbar; ldstub [%sp-1], %g0 + def : Pat<(atomic_fence timm, timm), (V8BAR)>; +} -let Predicates = [HasV9] in +// We have to handle both 32 and 64-bit cases. +let Predicates = [HasV9] in { + // Acquire -> membar #LoadLoad | #LoadStore + def : Pat<(atomic_fence (i32 4), timm), (MEMBARi 0x5)>; + def : Pat<(atomic_fence (i64 4), timm), (MEMBARi 0x5)>; + // Release -> membar #LoadStore | #StoreStore + def : Pat<(atomic_fence (i32 5), timm), (MEMBARi 0xc)>; + def : Pat<(atomic_fence (i64 5), timm), (MEMBARi 0xc)>; + // AcqRel -> membar #LoadLoad | #LoadStore | #StoreStore + def : Pat<(atomic_fence (i32 6), timm), (MEMBARi 0xd)>; + def : Pat<(atomic_fence (i64 6), timm), (MEMBARi 0xd)>; + // SeqCst -> membar #StoreLoad | #LoadLoad | #LoadStore | #StoreStore def : Pat<(atomic_fence timm, timm), (MEMBARi 0xf)>; +} // atomic_load addr -> load addr def : Pat<(i32 (atomic_load_azext_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 163bf9b..6472334 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3209,7 +3209,8 @@ static SDValue performAnyAllCombine(SDNode *N, SelectionDAG &DAG) { using namespace llvm::SDPatternMatch; SDValue LHS; - if (!sd_match(N->getOperand(1), + if (N->getNumOperands() < 2 || + !sd_match(N->getOperand(1), m_c_SetCC(m_Value(LHS), m_Zero(), m_CondCode()))) return SDValue(); EVT LT = LHS.getValueType(); |
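For reference on the V9 fence patterns above: the membar immediates decompose into the mmask bits named in the comments, assuming the standard SPARC V9 assignments #LoadLoad = 0x1, #StoreLoad = 0x2, #LoadStore = 0x4, #StoreStore = 0x8. A quick standalone check:

#include <cassert>

enum MembarBits {
  LoadLoad = 0x1,   // #LoadLoad
  StoreLoad = 0x2,  // #StoreLoad
  LoadStore = 0x4,  // #LoadStore
  StoreStore = 0x8  // #StoreStore
};

int main() {
  assert((LoadLoad | LoadStore) == 0x5);                          // acquire
  assert((LoadStore | StoreStore) == 0xc);                        // release
  assert((LoadLoad | LoadStore | StoreStore) == 0xd);             // acq_rel
  assert((StoreLoad | LoadLoad | LoadStore | StoreStore) == 0xf); // seq_cst
  return 0;
}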