Diffstat (limited to 'llvm/lib')
32 files changed, 521 insertions, 364 deletions
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 63e1b14..6f6776c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6351,19 +6351,20 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { return getGEPExpr(GEP, IndexExprs); } -APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { +APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S, + const Instruction *CtxI) { uint64_t BitWidth = getTypeSizeInBits(S->getType()); auto GetShiftedByZeros = [BitWidth](uint32_t TrailingZeros) { return TrailingZeros >= BitWidth ? APInt::getZero(BitWidth) : APInt::getOneBitSet(BitWidth, TrailingZeros); }; - auto GetGCDMultiple = [this](const SCEVNAryExpr *N) { + auto GetGCDMultiple = [this, CtxI](const SCEVNAryExpr *N) { // The result is GCD of all operands results. - APInt Res = getConstantMultiple(N->getOperand(0)); + APInt Res = getConstantMultiple(N->getOperand(0), CtxI); for (unsigned I = 1, E = N->getNumOperands(); I < E && Res != 1; ++I) Res = APIntOps::GreatestCommonDivisor( - Res, getConstantMultiple(N->getOperand(I))); + Res, getConstantMultiple(N->getOperand(I), CtxI)); return Res; }; @@ -6371,33 +6372,33 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { case scConstant: return cast<SCEVConstant>(S)->getAPInt(); case scPtrToInt: - return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand()); + return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand(), CtxI); case scUDivExpr: case scVScale: return APInt(BitWidth, 1); case scTruncate: { // Only multiples that are a power of 2 will hold after truncation. const SCEVTruncateExpr *T = cast<SCEVTruncateExpr>(S); - uint32_t TZ = getMinTrailingZeros(T->getOperand()); + uint32_t TZ = getMinTrailingZeros(T->getOperand(), CtxI); return GetShiftedByZeros(TZ); } case scZeroExtend: { const SCEVZeroExtendExpr *Z = cast<SCEVZeroExtendExpr>(S); - return getConstantMultiple(Z->getOperand()).zext(BitWidth); + return getConstantMultiple(Z->getOperand(), CtxI).zext(BitWidth); } case scSignExtend: { // Only multiples that are a power of 2 will hold after sext. const SCEVSignExtendExpr *E = cast<SCEVSignExtendExpr>(S); - uint32_t TZ = getMinTrailingZeros(E->getOperand()); + uint32_t TZ = getMinTrailingZeros(E->getOperand(), CtxI); return GetShiftedByZeros(TZ); } case scMulExpr: { const SCEVMulExpr *M = cast<SCEVMulExpr>(S); if (M->hasNoUnsignedWrap()) { // The result is the product of all operand results. - APInt Res = getConstantMultiple(M->getOperand(0)); + APInt Res = getConstantMultiple(M->getOperand(0), CtxI); for (const SCEV *Operand : M->operands().drop_front()) - Res = Res * getConstantMultiple(Operand); + Res = Res * getConstantMultiple(Operand, CtxI); return Res; } @@ -6405,7 +6406,7 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { // sum of trailing zeros for all its operands. uint32_t TZ = 0; for (const SCEV *Operand : M->operands()) - TZ += getMinTrailingZeros(Operand); + TZ += getMinTrailingZeros(Operand, CtxI); return GetShiftedByZeros(TZ); } case scAddExpr: @@ -6414,9 +6415,9 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { if (N->hasNoUnsignedWrap()) return GetGCDMultiple(N); // Find the trailing bits, which is the minimum of its operands. 
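The divisibility rules this switch encodes are easy to sanity-check with plain integers. A minimal standalone sketch (toy values and std::gcd, not SCEV/APInt code; needs C++20 for <bit>):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>
#include <numeric>

int main() {
  // Suppose 24 divides X and 40 divides Y (their "constant multiples").
  const uint64_t A = 24, B = 40;
  const uint64_t X = 3 * A, Y = 2 * B;
  // Add rule (nuw): the gcd of the operand multiples divides the sum.
  assert((X + Y) % std::gcd(A, B) == 0);
  // Mul rule (nuw): the product of the operand multiples divides the product.
  assert((X * Y) % (A * B) == 0);
  // Fallback: 2^(min trailing zeros over the operands) divides the sum.
  const int TZ = std::min(std::countr_zero(A), std::countr_zero(B));
  assert((X + Y) % (uint64_t{1} << TZ) == 0);
  return 0;
}
```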
- uint32_t TZ = getMinTrailingZeros(N->getOperand(0)); + uint32_t TZ = getMinTrailingZeros(N->getOperand(0), CtxI); for (const SCEV *Operand : N->operands().drop_front()) - TZ = std::min(TZ, getMinTrailingZeros(Operand)); + TZ = std::min(TZ, getMinTrailingZeros(Operand, CtxI)); return GetShiftedByZeros(TZ); } case scUMaxExpr: @@ -6429,7 +6430,7 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { // ask ValueTracking for known bits const SCEVUnknown *U = cast<SCEVUnknown>(S); unsigned Known = - computeKnownBits(U->getValue(), getDataLayout(), &AC, nullptr, &DT) + computeKnownBits(U->getValue(), getDataLayout(), &AC, CtxI, &DT) .countMinTrailingZeros(); return GetShiftedByZeros(Known); } @@ -6439,12 +6440,18 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { llvm_unreachable("Unknown SCEV kind!"); } -APInt ScalarEvolution::getConstantMultiple(const SCEV *S) { +APInt ScalarEvolution::getConstantMultiple(const SCEV *S, + const Instruction *CtxI) { + // Skip looking up and updating the cache if there is a context instruction, + // as the result will only be valid in the specified context. + if (CtxI) + return getConstantMultipleImpl(S, CtxI); + auto I = ConstantMultipleCache.find(S); if (I != ConstantMultipleCache.end()) return I->second; - APInt Result = getConstantMultipleImpl(S); + APInt Result = getConstantMultipleImpl(S, CtxI); auto InsertPair = ConstantMultipleCache.insert({S, Result}); assert(InsertPair.second && "Should insert a new key"); return InsertPair.first->second; } @@ -6455,8 +6462,9 @@ APInt ScalarEvolution::getNonZeroConstantMultiple(const SCEV *S) { return Multiple == 0 ? APInt(Multiple.getBitWidth(), 1) : Multiple; } -uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S) { - return std::min(getConstantMultiple(S).countTrailingZeros(), +uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S, + const Instruction *CtxI) { + return std::min(getConstantMultiple(S, CtxI).countTrailingZeros(), (unsigned)getTypeSizeInBits(S->getType())); } @@ -10243,8 +10251,7 @@ const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const { static const SCEV * SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, SmallVectorImpl<const SCEVPredicate *> *Predicates, - - ScalarEvolution &SE) { + ScalarEvolution &SE, const Loop *L) { uint32_t BW = A.getBitWidth(); assert(BW == SE.getTypeSizeInBits(B->getType())); assert(A != 0 && "A must be non-zero."); @@ -10260,7 +10267,12 @@ SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, // // B is divisible by D if and only if the multiplicity of prime factor 2 for B // is not less than multiplicity of this prime factor for D. - if (SE.getMinTrailingZeros(B) < Mult2) { + unsigned MinTZ = SE.getMinTrailingZeros(B); + // Try again with the terminator of the loop predecessor for a context-specific + // result, if MinTZ is too small. + if (MinTZ < Mult2 && L->getLoopPredecessor()) + MinTZ = SE.getMinTrailingZeros(B, L->getLoopPredecessor()->getTerminator()); + if (MinTZ < Mult2) { // Check if we can prove there's no remainder using URem. const SCEV *URem = SE.getURemExpr(B, SE.getConstant(APInt::getOneBitSet(BW, Mult2))); @@ -10708,7 +10720,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V, return getCouldNotCompute(); const SCEV *E = SolveLinEquationWithOverflow( StepC->getAPInt(), getNegativeSCEV(Start), - AllowPredicates ? &Predicates : nullptr, *this); + AllowPredicates ? 
&Predicates : nullptr, *this, L); const SCEV *M = E; if (E != getCouldNotCompute()) { diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 0ebee2c..fa0ccd6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6745,6 +6745,73 @@ bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI, return MatchNaN(1) || MatchNaN(2); } +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. +// E.g., (a / Y; b / Y;) -> (recip = 1.0 / Y; a * recip; b * recip) +bool CombinerHelper::matchRepeatedFPDivisor( + MachineInstr &MI, SmallVector<MachineInstr *> &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_FDIV); + + Register X = MI.getOperand(1).getReg(); + Register Y = MI.getOperand(2).getReg(); + + if (!MI.getFlag(MachineInstr::MIFlag::FmArcp)) + return false; + + // Skip if current node is a reciprocal/fneg-reciprocal. + auto N0CFP = isConstantOrConstantSplatVectorFP(*MRI.getVRegDef(X), MRI); + if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0))) + return false; + + // Exit early if the target does not want this transform or if there can't + // possibly be enough uses of the divisor to make the transform worthwhile. + unsigned MinUses = getTargetLowering().combineRepeatedFPDivisors(); + if (!MinUses) + return false; + + // Find all FDIV users of the same divisor. For the moment we limit all + // instructions to a single BB and use the first Instr in MatchInfo as the + // dominating position. + MatchInfo.push_back(&MI); + for (auto &U : MRI.use_nodbg_instructions(Y)) { + if (&U == &MI || U.getParent() != MI.getParent()) + continue; + if (U.getOpcode() == TargetOpcode::G_FDIV && + U.getOperand(2).getReg() == Y && U.getOperand(1).getReg() != Y) { + // This division is eligible for optimization only if global unsafe math + // is enabled or if this division allows reciprocal formation. + if (U.getFlag(MachineInstr::MIFlag::FmArcp)) { + MatchInfo.push_back(&U); + if (dominates(U, *MatchInfo[0])) + std::swap(MatchInfo[0], MatchInfo.back()); + } + } + } + + // Now that we have the actual number of divisor uses, make sure it meets + // the minimum threshold specified by the target. + return MatchInfo.size() >= MinUses; +} + +void CombinerHelper::applyRepeatedFPDivisor( + SmallVector<MachineInstr *> &MatchInfo) const { + // Generate the new div at the position of the first instruction, which we have + // ensured will dominate all other instructions. + Builder.setInsertPt(*MatchInfo[0]->getParent(), MatchInfo[0]); + LLT Ty = MRI.getType(MatchInfo[0]->getOperand(0).getReg()); + auto Div = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0), + MatchInfo[0]->getOperand(2).getReg(), + MatchInfo[0]->getFlags()); + + // Replace all found divs with fmul instructions. 
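Concretely, the rewrite performed by the loop that follows trades N divisions by Y for one division plus N multiplications, and the products may differ from the original quotients in the last ulp, which is why the FmArcp flag gates the combine. A tiny demonstration in plain C++ (not MIR):

```cpp
#include <cstdio>

int main() {
  const float A = 1.0f, B = 2.0f, Y = 3.0f;
  const float Recip = 1.0f / Y;  // the single shared division
  // %a prints exact bit patterns; each pair may differ by one ulp,
  // which 'arcp' (allow reciprocal) explicitly permits.
  std::printf("%a vs %a\n", A / Y, A * Recip);
  std::printf("%a vs %a\n", B / Y, B * Recip);
  return 0;
}
```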
+ for (MachineInstr *MI : MatchInfo) { + Builder.setInsertPt(*MI->getParent(), MI); + Builder.buildFMul(MI->getOperand(0).getReg(), MI->getOperand(1).getReg(), + Div->getOperand(0).getReg(), MI->getFlags()); + MI->eraseFromParent(); + } +} + bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) const { assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); Register LHS = MI.getOperand(1).getReg(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 309f1be..c5c3866 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19319,9 +19319,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't // know it was called from a context with a nsz flag if the input fsub does // not. - if (N0.getOpcode() == ISD::FSUB && - (DAG.getTarget().Options.NoSignedZerosFPMath || - N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) { + if (N0.getOpcode() == ISD::FSUB && N->getFlags().hasNoSignedZeros() && + N0.hasOneUse()) { return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1), N0.getOperand(0)); } diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index d3c5761..b8d71c5 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -361,6 +361,7 @@ COFFPlatform::standardRuntimeUtilityAliases() { {"__orc_rt_run_program", "__orc_rt_coff_run_program"}, {"__orc_rt_jit_dlerror", "__orc_rt_coff_jit_dlerror"}, {"__orc_rt_jit_dlopen", "__orc_rt_coff_jit_dlopen"}, + {"__orc_rt_jit_dlupdate", "__orc_rt_coff_jit_dlupdate"}, {"__orc_rt_jit_dlclose", "__orc_rt_coff_jit_dlclose"}, {"__orc_rt_jit_dlsym", "__orc_rt_coff_jit_dlsym"}, {"__orc_rt_log_error", "__orc_rt_log_error_to_stderr"}}; diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 67bb7dd..7487526 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -617,14 +617,11 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { [](const JITDylibSearchOrder &SO) { return SO; }); StringRef WrapperToCall = "__orc_rt_jit_dlopen_wrapper"; bool dlupdate = false; - const Triple &TT = ES.getTargetTriple(); - if (TT.isOSBinFormatMachO() || TT.isOSBinFormatELF()) { - if (InitializedDylib.contains(&JD)) { - WrapperToCall = "__orc_rt_jit_dlupdate_wrapper"; - dlupdate = true; - } else - InitializedDylib.insert(&JD); - } + if (InitializedDylib.contains(&JD)) { + WrapperToCall = "__orc_rt_jit_dlupdate_wrapper"; + dlupdate = true; + } else + InitializedDylib.insert(&JD); if (auto WrapperAddr = ES.lookup(MainSearchOrder, J.mangleAndIntern(WrapperToCall))) { diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 076a623..639ddcb 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -69,7 +69,6 @@ def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, - fconstant_to_constant, icmp_redundant_trunc, fold_global_offset, shuffle_to_extract, @@ -341,7 +340,7 @@ def AArch64PostLegalizerLowering : GICombiner<"AArch64PostLegalizerLoweringImpl", [shuffle_vector_lowering, vashr_vlshr_imm, icmp_lowering, build_vector_lowering, - lower_vector_fcmp, form_truncstore, + 
lower_vector_fcmp, form_truncstore, fconstant_to_constant, vector_sext_inreg_to_shift, unmerge_ext_to_unmerge, lower_mulv2s64, vector_unmerge_lowering, insertelt_nonconst, diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 0f4bbfc3..1e607f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,9 +92,18 @@ private: bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock * - expandCommitOrRestoreZASave(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + struct ConditionalBlocks { + MachineBasicBlock &CondBB; + MachineBasicBlock &EndBB; + }; + ConditionalBlocks expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch); + MachineBasicBlock *expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + MachineBasicBlock *expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -991,72 +1000,97 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; - -MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; - assert((MI.getOpcode() == AArch64::RestoreZAPseudo || - MI.getOpcode() == AArch64::CommitZASavePseudo) && - "Expected ZA commit or restore"); +AArch64ExpandPseudo::ConditionalBlocks +AArch64ExpandPseudo::expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch) { assert((std::next(MBBI) != MBB.end() || - MI.getParent()->successors().begin() != - MI.getParent()->successors().end()) && - "Unexpected unreachable in block that restores ZA"); - - // Compare TPIDR2_EL0 value against 0. - DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Branch = - BuildMI(MBB, MBBI, DL, - TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) - .add(MI.getOperand(0)); + MBB.successors().begin() != MBB.successors().end()) && + "Unexpected unreachable in block"); // Split MBB and create two new blocks: - // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. - // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. + // - MBB now contains all instructions before the conditional pseudo. + // - CondBB contains the conditional pseudo instruction only. + // - EndBB contains all instructions after the conditional pseudo. MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. - Branch.addMBB(SMBB); + MachineBasicBlock *CondBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); + MachineBasicBlock *EndBB = + std::next(MBBI) == CondBB->end() + ? 
*CondBB->successors().begin() : CondBB->splitAt(*MBBI, /*UpdateLiveIns*/ true); + + // Add the CondBB label to the branch instruction & create a branch to EndBB. + Branch.addMBB(CondBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); + // Create branch from CondBB to EndBB. Users of this helper should insert new + // instructions at CondBB.back() -- i.e. before the branch. + BuildMI(CondBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + return {*CondBB, *EndBB}; +} + +MachineBasicBlock * +AArch64ExpandPseudo::expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Restore ZA if TPIDR2_EL0 is zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - if (IsRestoreZA) { - // Mark the TPIDR2 block pointer (X0) as an implicit use. - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - } else /*CommitZA*/ { + MI.eraseFromParent(); + return &EndBB; +} + +static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock * +AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Commit ZA if TPIDR2_EL0 is non-zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBNZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); + // Replace the pseudo with a call (BL). + MachineInstrBuilder MIB = + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); + // Copy operands (mainly the regmask) from the pseudo. + for (unsigned I = 2; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + // Clear TPIDR2_EL0. + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { [[maybe_unused]] auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - // Clear TPIDR2_EL0. 
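For orientation, the runtime effect of the two expansions, written as hand-rolled C++ pseudocode; the helper names are assumptions for illustration, not symbols the pass emits:

```cpp
#include <cstdint>

extern "C" uint64_t read_tpidr2_el0();       // stands in for MRS TPIDR2_EL0 (assumed)
extern "C" void write_tpidr2_el0(uint64_t);  // stands in for MSR TPIDR2_EL0 (assumed)
extern "C" void za_restore_call(void *);     // stands in for the BL target (assumed)
extern "C" void za_commit_call();            // stands in for the BL target (assumed)

void restore_za(void *tpidr2_block) {
  if (read_tpidr2_el0() == 0)        // CBZX: restore only when a callee
    za_restore_call(tpidr2_block);   // already committed the lazy save
}

void commit_za(bool zero_za) {
  if (read_tpidr2_el0() != 0) {      // CBNZX: commit a still-pending save
    za_commit_call();                // BL in CondBB
    write_tpidr2_el0(0);             // MSR TPIDR2_EL0, XZR
    if (zero_za) { /* ZERO_M: zero all of ZA */ }
  }
}
```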
- BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) - .addImm(AArch64SysReg::TPIDR2_EL0) - .addReg(AArch64::XZR); - bool ZeroZA = MI.getOperand(1).getImm() != 0; - if (ZeroZA) { - assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) - .addImm(ZERO_ALL_ZA_MASK) - .addDef(AArch64::ZAB0, RegState::ImplicitDefine); - } + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); } - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); - return EndBB; + return &EndBB; } MachineBasicBlock * @@ -1130,24 +1164,9 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, MachineInstrBuilder Tbx = BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0); - // Split MBB and create two new blocks: - // - MBB now contains all instructions before MSRcond_pstatesvcrImm1. - // - SMBB contains the MSRcond_pstatesvcrImm1 instruction only. - // - EndBB contains all instructions after MSRcond_pstatesvcrImm1. - MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Tbx.addMBB(SMBB); - BuildMI(&MBB, DL, TII->get(AArch64::B)) - .addMBB(EndBB); - MBB.addSuccessor(EndBB); - + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Tbx); // Create the SMSTART/SMSTOP (MSRpstatesvcrImm1) instruction in SMBB. - MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->begin(), MI.getDebugLoc(), + MachineInstrBuilder MIB = BuildMI(CondBB, CondBB.back(), MI.getDebugLoc(), TII->get(AArch64::MSRpstatesvcrImm1)); // Copy all but the second and third operands of MSRcond_pstatesvcrImm1 (as // these contain the CopyFromReg for the first argument and the flag to @@ -1157,10 +1176,8 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, for (unsigned i = 4; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); - MI.eraseFromParent(); - return EndBB; + return &EndBB; } bool AArch64ExpandPseudo::expandMultiVecPseudo( @@ -1674,15 +1691,21 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::RestoreZAPseudo: case AArch64::CommitZASavePseudo: - case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); - if (NewMBB != &MBB) - NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. - return true; - } case AArch64::MSRpstatePseudo: { - auto *NewMBB = expandCondSMToggle(MBB, MBBI); + auto *NewMBB = [&] { + switch (Opcode) { + case AArch64::RestoreZAPseudo: + return expandRestoreZASave(MBB, MBBI); + case AArch64::CommitZASavePseudo: + return expandCommitZASave(MBB, MBBI); + case AArch64::MSRpstatePseudo: + return expandCondSMToggle(MBB, MBBI); + default: + llvm_unreachable("Unexpected conditional pseudo!"); + } + }(); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index c197550e..9e2d698 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({s32, s64, s128}) - .legalFor(HasFP16, {s16}) + // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT + .legalFor({s16, s32, s64, s128}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 63313da..23dcaea 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -75,6 +75,31 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo() = default; }; +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 16 && DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. +void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + /// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector /// sources of the shuffle are different. std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 8c10673..896eab5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -44,31 +44,6 @@ namespace { #include "AArch64GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -/// Return true if a G_FCONSTANT instruction is known to be better-represented -/// as a G_CONSTANT. -bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - Register DstReg = MI.getOperand(0).getReg(); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - if (DstSize != 32 && DstSize != 64) - return false; - - // When we're storing a value, it doesn't matter what register bank it's on. - // Since not all floating point constants can be materialized using a fmov, - // it makes more sense to just use a GPR. 
- return all_of(MRI.use_nodbg_instructions(DstReg), - [](const MachineInstr &Use) { return Use.mayStore(); }); -} - -/// Change a G_FCONSTANT into a G_CONSTANT. -void applyFConstantToConstant(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - MachineIRBuilder MIB(MI); - const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); - MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); - MI.eraseFromParent(); -} - /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits /// are sign bits. In this case, we can transform the G_ICMP to directly compare /// the wide value with a zero. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 723d07e..c7a91f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -929,7 +929,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - if (getTargetTriple().isAMDGCN()) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opts; MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); } @@ -966,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a8140c3..99ba043 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2105,6 +2105,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { // Only plain immediates are inlinable (e.g. "clamp" attribute is not) return false; } + + if (getModifiers().Lit != LitModifier::None) + return false; + // TODO: We should avoid using host float here. It would be better to // check the float bit values which is what a few other places do. // We've had bot failures before due to weird NaN support on mips hosts. 
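The relocated fconstant_to_constant combine rests on a simple observation: a float constant that is only ever stored can be materialized as its integer bit pattern in a GPR. The same idea in a few lines of standalone C++ (a sketch, not GISel code; C++20 for std::bit_cast):

```cpp
#include <bit>
#include <cstdint>

void store_float_constant(uint32_t *slot) {
  // 0.1f has no fmov encoding, but its bit pattern stores identically.
  constexpr uint32_t Bits = std::bit_cast<uint32_t>(0.1f);
  *slot = Bits;  // same memory contents as storing the float directly
}
```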
@@ -2339,6 +2343,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo bool CanUse64BitLiterals = AsmParser->has64BitLiterals() && !(InstDesc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + LitModifier Lit = getModifiers().Lit; MCContext &Ctx = AsmParser->getContext(); if (Imm.IsFPImm) { // We got fp literal token @@ -2348,7 +2353,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); return; @@ -2372,14 +2378,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo if ((OpTy == AMDGPU::OPERAND_REG_IMM_FP64 || OpTy == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64) && - CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); + OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64)) { + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) { + // The floating-point operand will be verbalized as an + // integer one. If that integer happens to fit 32 bits, on + // re-assembling it will be interpreted as the high half of + // the actual value, so we have to wrap it into lit64(). + Lit = LitModifier::Lit64; + } else if (Lit == LitModifier::Lit) { + // For FP64 operands lit() specifies the high half of the value. + Val = Hi_32(Val); + } } - return; + break; } // We don't allow fp literals in 64-bit integer instructions. It is @@ -2388,19 +2400,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo llvm_unreachable("fp literal in 64-bit integer instruction."); case AMDGPU::OPERAND_KIMM64: - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) + Lit = LitModifier::Lit64; + break; case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2BF16: - if (AsmParser->hasInv2PiInlineImm() && Literal == 0x3fc45f306725feed) { + if (Lit == LitModifier::None && AsmParser->hasInv2PiInlineImm() && + Literal == 0x3fc45f306725feed) { // This is the 1/(2*pi) which is going to be truncated to bf16 with the // loss of precision. The constant represents idiomatic fp32 value of // 1/(2*pi) = 0.15915494 since bf16 is in fact fp32 with cleared low 16 @@ -2438,14 +2448,19 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // We allow precision loss but not overflow or underflow. 
This should be // checked earlier in isLiteralImm() - uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); - Inst.addOperand(MCOperand::createImm(ImmVal)); - return; + Val = FPLiteral.bitcastToAPInt().getZExtValue(); + break; } default: llvm_unreachable("invalid operand size"); } + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; } @@ -2465,12 +2480,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2479,22 +2494,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // truncated to uint32_t), if the target doesn't support 64-bit literals, or // the lit modifier is explicitly used, we need to truncate it to the 32 // LSBs. - if (!AsmParser->has64BitLiterals() || - getModifiers().Lit == LitModifier::Lit) + if (!AsmParser->has64BitLiterals() || Lit == LitModifier::Lit) Val = Lo_32(Val); - - if (CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val))) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2509,19 +2517,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // 1) explicitly forced by using lit modifier; // 2) the value is a valid 32-bit representation (signed or unsigned), // meanwhile not forced by lit64 modifier. - if (getModifiers().Lit == LitModifier::Lit || - (getModifiers().Lit != LitModifier::Lit64 && - (isInt<32>(Val) || isUInt<32>(Val)))) + if (Lit == LitModifier::Lit || + (Lit != LitModifier::Lit64 && (isInt<32>(Val) || isUInt<32>(Val)))) Val = static_cast<uint64_t>(Val) << 32; } - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + // For FP64 operands lit() specifies the high half of the value. 
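The half-word bookkeeping above is easier to follow with the halves spelled out. A small self-contained sketch mirroring llvm::Hi_32/llvm::Lo_32 (illustration only, not the LLVM helpers themselves):

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
constexpr uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

int main() {
  // The bits of double 1.0: all payload sits in the high half, which is
  // why a 32-bit FP64 literal (and lit()) supplies Hi_32 of the value.
  const uint64_t OneFP64 = 0x3FF0000000000000ull;
  assert(hi32(OneFP64) == 0x3FF00000u);
  assert(lo32(OneFP64) == 0);
  return 0;
}
```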
+ if (Lit == LitModifier::Lit) + Val = Hi_32(Val); + break; case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -2534,25 +2538,23 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_KIMM64: - if ((isInt<32>(Val) || isUInt<32>(Val)) && - getModifiers().Lit != LitModifier::Lit64) + if ((isInt<32>(Val) || isUInt<32>(Val)) && Lit != LitModifier::Lit64) Val <<= 32; - - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; default: llvm_unreachable("invalid operand type"); } + + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } } void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { @@ -4821,12 +4823,15 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, const MCOperand &MO = Inst.getOperand(OpIdx); // Exclude special imm operands (like that used by s_set_gpr_idx_on) if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + bool IsLit = false; std::optional<int64_t> Imm; if (MO.isImm()) { Imm = MO.getImm(); } else if (MO.isExpr()) { - if (isLitExpr(MO.getExpr())) + if (isLitExpr(MO.getExpr())) { + IsLit = true; Imm = getLitValue(MO.getExpr()); + } } else { continue; } @@ -4836,7 +4841,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, } else if (!isInlineConstant(Inst, OpIdx)) { auto OpType = static_cast<AMDGPU::OperandType>( Desc.operands()[OpIdx].OperandType); - int64_t Value = encode32BitLiteral(*Imm, OpType); + int64_t Value = encode32BitLiteral(*Imm, OpType, IsLit); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f11b373..be62395 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1551,7 +1551,7 @@ AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { HasLiteral = true; Literal = Literal64 = Val; - bool UseLit64 = Lo_32(Literal64) != 0; + bool UseLit64 = Hi_32(Literal64) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( LitModifier::Lit64, Literal64, getContext())) : MCOperand::createImm(Literal64); @@ -1584,11 +1584,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, if (CanUse64BitLiterals) { if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) - UseLit64 = !isInt<32>(Val) || !isUInt<32>(Val); + UseLit64 = false; else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) - UseLit64 = Lo_32(Val) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( @@ -1614,12 +1614,12 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { - UseLit64 = !isInt<32>(Literal64) || !isUInt<32>(Literal64); + UseLit64 = false; } else { assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); - UseLit64 = Lo_32(Literal64) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e82f998..703ec0a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -73,7 +73,13 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + MAI.printExpr(O, *Op.getExpr()); + return; + } + + O << formatHex(Op.getImm() & 0xffffffff); } void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f2879116..ea758bb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -270,10 +270,19 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo, const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const { const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int64_t Imm; + int64_t Imm = 0; if (MO.isExpr()) { - if (!MO.getExpr()->evaluateAsAbsolute(Imm)) - return AMDGPU::getOperandSize(OpInfo) == 8 ? 254 : 255; + if (!MO.getExpr()->evaluateAsAbsolute(Imm) || + AMDGPU::isLitExpr(MO.getExpr())) { + if (OpInfo.OperandType == AMDGPU::OPERAND_KIMM16 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM32 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM64) + return Imm; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + AMDGPU::getOperandSize(OpInfo) == 8) + return 254; + return 255; + } } else { assert(!MO.isDFPImm()); @@ -452,13 +461,16 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Yes! Encode it int64_t Imm = 0; + bool IsLit = false; if (Op.isImm()) Imm = Op.getImm(); else if (Op.isExpr()) { - if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) { Imm = C->getValue(); - else if (AMDGPU::isLitExpr(Op.getExpr())) + } else if (AMDGPU::isLitExpr(Op.getExpr())) { + IsLit = true; Imm = AMDGPU::getLitValue(Op.getExpr()); + } } else // Exprs will be replaced with a fixup value. 
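The disassembler's switch from Lo_32(Literal64) != 0 to Hi_32(Literal64) == 0 is about round-tripping: for FP64 operands, a bare printed literal that fits in 32 bits is re-read as the high half. A reduced model of that hazard (the placement rule here is distilled from the parser comment above, not real MC code):

```cpp
#include <cassert>
#include <cstdint>

// Simplified re-assembly rule for a bare FP64 literal (assumed model).
uint64_t reassemble_fp64(uint64_t Printed) {
  return Printed <= UINT32_MAX ? Printed << 32 : Printed;
}

int main() {
  const uint64_t Val = 0x1;  // Hi_32(Val) == 0
  // A bare "1" would come back as 0x1'0000'0000, so the disassembler
  // must print lit64(1) instead of the plain immediate.
  assert(reassemble_fp64(Val) != Val);
  return 0;
}
```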
llvm_unreachable("Must be immediate or expr"); @@ -468,7 +480,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, } else { auto OpType = static_cast<AMDGPU::OperandType>(Desc.operands()[i].OperandType); - Imm = AMDGPU::encode32BitLiteral(Imm, OpType); + Imm = AMDGPU::encode32BitLiteral(Imm, OpType, IsLit); support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 76023d2..3e1b058 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3145,7 +3145,7 @@ bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { return isUInt<32>(Val) || isInt<32>(Val); } -int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) { switch (Type) { default: break; @@ -3168,7 +3168,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { case OPERAND_REG_INLINE_C_INT32: return Lo_32(Imm); case OPERAND_REG_IMM_FP64: - return Hi_32(Imm); + return IsLit ? Imm : Hi_32(Imm); } return Imm; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 49b4d02..a01a5fd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1727,7 +1727,7 @@ LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); LLVM_READNONE -int64_t encode32BitLiteral(int64_t Imm, OperandType Type); +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit); bool isArgPassedInSGPR(const Argument *Arg); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 186fdd1..53633ea 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -675,6 +675,45 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC, CC = getRISCVCCFromICmp(Pred); } +/// Select the RISC-V Zalasr opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. +static unsigned selectZalasrLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + default: + llvm_unreachable("Unexpected memory size"); + case 8: + return IsStore ? RISCV::SB_RL : RISCV::LB_AQ; + case 16: + return IsStore ? RISCV::SH_RL : RISCV::LH_AQ; + case 32: + return IsStore ? RISCV::SW_RL : RISCV::LW_AQ; + case 64: + return IsStore ? RISCV::SD_RL : RISCV::LD_AQ; + } +} + +/// Select the RISC-V regimm opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. \returns \p GenericOpc if the combination is unsupported. +static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + case 8: + // Prefer unsigned due to no c.lb in Zcb. + return IsStore ? RISCV::SB : RISCV::LBU; + case 16: + return IsStore ? RISCV::SH : RISCV::LH; + case 32: + return IsStore ? RISCV::SW : RISCV::LW; + case 64: + return IsStore ? 
RISCV::SD : RISCV::LD; + } + + return GenericOpc; +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -892,6 +931,59 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return selectImplicitDef(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + GLoadStore &LdSt = cast<GLoadStore>(MI); + const Register ValReg = LdSt.getReg(0); + const Register PtrReg = LdSt.getPointerReg(); + LLT PtrTy = MRI->getType(PtrReg); + + const RegisterBank &RB = *RBI.getRegBank(ValReg, *MRI, TRI); + if (RB.getID() != RISCV::GPRBRegBankID) + return false; + +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, *MRI, TRI); + // Check that the pointer register is valid. + assert(PtrRB.getID() == RISCV::GPRBRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(PtrTy.isPointer() && "Load/Store pointer operand isn't a pointer"); +#endif + + // Can only handle AddressSpace 0. + if (PtrTy.getAddressSpace() != 0) + return false; + + unsigned MemSize = LdSt.getMemSizeInBits().getValue(); + AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); + + if (isStrongerThanMonotonic(Order)) { + MI.setDesc(TII.get(selectZalasrLoadStoreOp(Opc, MemSize))); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + const unsigned NewOpc = selectRegImmLoadStoreOp(MI.getOpcode(), MemSize); + if (NewOpc == MI.getOpcode()) + return false; + + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = selectAddrRegImm(MI.getOperand(1)); + if (!AddrModeFns) + return false; + + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, MI.getFlags()); + if (isa<GStore>(MI)) + NewInst.addUse(ValReg); + else + NewInst.addDef(ValReg); + NewInst.cloneMemRefs(MI); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); + } default: return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 1c7cbb9..5dd4bf4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -287,8 +287,8 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -375,8 +375,8 @@ static void doMaskedAtomicBinOpExpansion(const RISCVInstrInfo *TII, ScratchReg); BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -535,8 +535,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( // sc.w scratch1, scratch1, (addr) // bnez scratch1, loop BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), Scratch1Reg) - .addReg(AddrReg) - .addReg(Scratch1Reg); + .addReg(Scratch1Reg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(Scratch1Reg) .addReg(RISCV::X0) @@ -674,8 +674,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( // bnez scratch, loophead BuildMI(LoopTailMBB, DL, 
TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(NewValReg); + .addReg(NewValReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -707,8 +707,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( MaskReg, ScratchReg); BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 7dd3385..eba35ef 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -100,65 +100,11 @@ def : LdPat<load, LD, PtrVT>; def : StPat<store, SD, GPR, PtrVT>; } -// Load and store patterns for i16, needed because Zfh makes s16 load/store -// legal and regbank select may not constrain registers to FP. -def : LdPat<load, LH, i16>; -def : StPat<store, SH, GPR, i16>; - -def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. -def : StPat<truncstorei8, SB, GPR, i16>; - -let Predicates = [HasAtomicLdSt] in { - // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; - def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - - def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; - def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; -} - -let Predicates = [HasAtomicLdSt, IsRV64] in { - // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; -} - //===----------------------------------------------------------------------===// // RV64 i32 patterns not used by SelectionDAG //===----------------------------------------------------------------------===// let Predicates = [IsRV64] in { -def : LdPat<extloadi8, LBU, i32>; // Prefer unsigned due to no c.lb in Zcb. -def : LdPat<extloadi16, LH, i32>; - -def : StPat<truncstorei8, SB, GPR, i32>; -def : StPat<truncstorei16, SH, GPR, i32>; - def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32), (ADDIW GPR:$rs1, simm12_lo:$imm)>; } - -//===----------------------------------------------------------------------===// -// Zalasr patterns not used by SelectionDAG -//===----------------------------------------------------------------------===// - -let Predicates = [HasStdExtZalasr] in { - // the sequentially consistent loads use - // .aq instead of .aqrl to match the psABI/A.7 - def : PatLAQ<acquiring_load<atomic_load_aext_8>, LB_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_aext_8>, LB_AQ, i16>; - - def : PatLAQ<acquiring_load<atomic_load_nonext_16>, LH_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_nonext_16>, LH_AQ, i16>; - - def : PatSRL<releasing_store<atomic_store_8>, SB_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_8>, SB_RL, i16>; - - def : PatSRL<releasing_store<atomic_store_16>, SH_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_16>, SH_RL, i16>; -} - -let Predicates = [HasStdExtZalasr, IsRV64] in { - // Load pattern is in RISCVInstrInfoZalasr.td and shared with RV32. 
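Stepping back from the operand-order swap: each expansion touched above emits an LR/SC retry loop whose semantics match a C++ compare-exchange loop. A sketch of the semantics only (the pass emits real lr.w/sc.w MIR, not this code); note the sc.w comment reflects the new rd, rs2, (rs1) operand order:

```cpp
#include <atomic>

int amo_add_lr_sc_style(std::atomic<int> &Mem, int Inc) {
  int Old = Mem.load(std::memory_order_relaxed);  // lr.w   old, (addr)
  while (!Mem.compare_exchange_weak(              // sc.w   scratch, new, (addr)
      Old, Old + Inc,
      std::memory_order_seq_cst,
      std::memory_order_relaxed)) {
    // store-conditional failed: bnez scratch, loop
  }
  return Old;
}
```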
- def : PatSRL<releasing_store<atomic_store_32>, SW_RL, i32>; - def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL, i32>; -} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9855c47..7a14929 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1980,7 +1980,7 @@ def : LdPat<sextloadi8, LB>; def : LdPat<extloadi8, LBU>; // Prefer unsigned due to no c.lb in Zcb. def : LdPat<sextloadi16, LH>; def : LdPat<extloadi16, LH>; -def : LdPat<load, LW, i32>; +def : LdPat<load, LW, i32>, Requires<[IsRV32]>; def : LdPat<zextloadi8, LBU>; def : LdPat<zextloadi16, LHU>; @@ -1994,7 +1994,7 @@ class StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy, def : StPat<truncstorei8, SB, GPR, XLenVT>; def : StPat<truncstorei16, SH, GPR, XLenVT>; -def : StPat<store, SW, GPR, i32>; +def : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>; /// Fences diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 2e4326f..571d72f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -33,7 +33,7 @@ multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class SC_r<bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<0b00011, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { @@ -46,7 +46,7 @@ multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> { @@ -174,8 +174,9 @@ let Predicates = [HasAtomicLdSt] in { def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasAtomicLdSt, IsRV32] in {
   def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>;
 }
 
@@ -188,31 +189,34 @@ let Predicates = [HasAtomicLdSt, IsRV64] in {
 
 /// AMOs
 
+class PatAMO<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
+  : Pat<(vt (OpNode (XLenVT GPR:$rs1), (vt GPR:$rs2))), (Inst GPR:$rs2, GPR:$rs1)>;
+
 multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT,
                   list<Predicate> ExtraPreds = []> {
   let Predicates = !listconcat([HasStdExtA, NoStdExtZtso], ExtraPreds) in {
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"),
-                    !cast<RVInst>(BaseInst), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"),
-                    !cast<RVInst>(BaseInst#"_AQ"), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"),
-                    !cast<RVInst>(BaseInst#"_RL"), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"),
-                    !cast<RVInst>(BaseInst#"_AQRL"), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"),
-                    !cast<RVInst>(BaseInst#"_AQRL"), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"),
+                 !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"),
+                 !cast<RVInst>(BaseInst#"_AQ"), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"),
+                 !cast<RVInst>(BaseInst#"_RL"), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"),
+                 !cast<RVInst>(BaseInst#"_AQRL"), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"),
+                 !cast<RVInst>(BaseInst#"_AQRL"), vt>;
   }
 
   let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in {
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"),
-                    !cast<RVInst>(BaseInst), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"),
-                    !cast<RVInst>(BaseInst), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"),
-                    !cast<RVInst>(BaseInst), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"),
-                    !cast<RVInst>(BaseInst), vt>;
-    def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"),
-                    !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"),
+                 !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"),
+                 !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"),
+                 !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"),
+                 !cast<RVInst>(BaseInst), vt>;
+    def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"),
+                 !cast<RVInst>(BaseInst), vt>;
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
index c691aa6..20e2142 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
@@ -44,7 +44,7 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "$rd = $rd_wb"
 class AMO_cas<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr,
               DAGOperand RC>
     : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO,
-                    (outs RC:$rd_wb), (ins RC:$rd, GPRMemZeroOffset:$rs1, RC:$rs2),
+                    (outs RC:$rd_wb), (ins RC:$rd, RC:$rs2, GPRMemZeroOffset:$rs1),
                     opcodestr, "$rd, $rs2, $rs1">;
 
 multiclass AMO_cas_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr,
@@ -71,48 +71,48 @@ defm AMOCAS_Q : AMO_cas_aq_rl<0b00101, 0b100, "amocas.q", GPRPairRV64>;
 
 multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT,
                      list<Predicate> ExtraPreds = []> {
   let Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) in {
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr),
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr),
               (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$addr, GPR:$new)>;
+              (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>;
     def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$addr, GPR:$new)>;
+              (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>;
  } // Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds)
 
  let Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) in {
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr),
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
-    def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr),
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
+    def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (XLenVT GPR:$addr),
              (vt GPR:$cmp), (vt GPR:$new)),
-              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>;
+              (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>;
  } // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds)
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
index f7ceb0d..5f944034 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
@@ -94,11 +94,12 @@ let Predicates = [HasStdExtZalasr] in {
   def : PatSRL<releasing_store<atomic_store_32>, SW_RL>;
   def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>;
+}
 
-  // Used by GISel for RV32 and RV64.
+let Predicates = [HasStdExtZalasr, IsRV32] in {
   def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>;
   def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>;
-} // Predicates = [HasStdExtZalasr]
+} // Predicates = [HasStdExtZalasr, IsRV32]
 
 let Predicates = [HasStdExtZalasr, IsRV64] in {
   def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ, i64>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
index fc14a03..f7be2a1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -32,7 +32,9 @@ class SPIRVLegalizeImplicitBinding : public ModulePass {
 public:
   static char ID;
   SPIRVLegalizeImplicitBinding() : ModulePass(ID) {}
-
+  StringRef getPassName() const override {
+    return "SPIRV Legalize Implicit Binding";
+  }
   bool runOnModule(Module &M) override;
 
 private:
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index c0cd0176e..f66eb9d 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -668,6 +668,7 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .addImm(-1);
     MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
     MBB.erase(MI);
+    return true;
   }
   }
   return false;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 048cdf4..d56a1af 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1970,12 +1970,6 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
       NewPhiValues.push_back(nullptr);
       OpsToMoveUseToIncomingBB.push_back(i);
 
-      // If the InVal is an invoke at the end of the pred block, then we can't
-      // insert a computation after it without breaking the edge.
-      if (isa<InvokeInst>(InVal))
-        if (cast<Instruction>(InVal)->getParent() == InBB)
-          return nullptr;
-
       // Do not push the operation across a loop backedge. This could result in
       // an infinite combine loop, and is generally non-profitable (especially
       // if the operation was originally outside the loop).
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 584cdad..e448230 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -1206,19 +1206,18 @@ private:
     // value for the new predecessor ClonedBB. The value will either be the same
     // value from BB or a cloned value.
     for (BasicBlock *Succ : BlocksToUpdate) {
-      for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II);
-           ++II) {
-        Value *Incoming = Phi->getIncomingValueForBlock(BB);
+      for (PHINode &Phi : Succ->phis()) {
+        Value *Incoming = Phi.getIncomingValueForBlock(BB);
         if (Incoming) {
           if (isa<Constant>(Incoming)) {
-            Phi->addIncoming(Incoming, ClonedBB);
+            Phi.addIncoming(Incoming, ClonedBB);
             continue;
           }
           Value *ClonedVal = VMap[Incoming];
           if (ClonedVal)
-            Phi->addIncoming(ClonedVal, ClonedBB);
+            Phi.addIncoming(ClonedVal, ClonedBB);
           else
-            Phi->addIncoming(Incoming, ClonedBB);
+            Phi.addIncoming(Incoming, ClonedBB);
         }
       }
     }
@@ -1313,27 +1312,19 @@ private:
   void cleanPhiNodes(BasicBlock *BB) {
     // If BB is no longer reachable, remove any remaining phi nodes
     if (pred_empty(BB)) {
-      std::vector<PHINode *> PhiToRemove;
-      for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
-        PhiToRemove.push_back(Phi);
-      }
-      for (PHINode *PN : PhiToRemove) {
-        PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
-        PN->eraseFromParent();
+      for (PHINode &PN : make_early_inc_range(BB->phis())) {
+        PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+        PN.eraseFromParent();
       }
       return;
     }
 
     // Remove any incoming values that come from an invalid predecessor
-    for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
-      std::vector<BasicBlock *> BlocksToRemove;
-      for (BasicBlock *IncomingBB : Phi->blocks()) {
-        if (!isPredecessor(BB, IncomingBB))
-          BlocksToRemove.push_back(IncomingBB);
-      }
-      for (BasicBlock *BB : BlocksToRemove)
-        Phi->removeIncomingValue(BB);
-    }
+    for (PHINode &Phi : BB->phis())
+      Phi.removeIncomingValueIf([&](unsigned Index) {
+        BasicBlock *IncomingBB = Phi.getIncomingBlock(Index);
+        return !isPredecessor(BB, IncomingBB);
+      });
   }
 
   /// Checks if BB was already cloned for a particular next state value. If it
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cee08ef..3f16b03 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4264,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
   if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
         if (!Legal->isReductionVariable(&Phi))
           return Legal->isFixedOrderRecurrence(&Phi);
-        RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
-        return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+        return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+            Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind());
       }))
     return false;
 
@@ -7282,8 +7282,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     if (!Exit->hasPredecessors())
       continue;
     for (VPRecipeBase &PhiR : Exit->phis())
-      SE.forgetLcssaPhiWithNewPredecessor(
-          OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction()));
+      SE.forgetLcssaPhiWithNewPredecessor(OrigLoop,
+                                          &cast<VPIRPhi>(PhiR).getIRPhi());
   }
   // Forget the original loop and block dispositions.
   SE.forgetLoop(OrigLoop);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 91c3d42..cfa8d27 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10621,7 +10621,8 @@ class InstructionsCompatibilityAnalysis {
   /// elements.
   static bool isSupportedOpcode(const unsigned Opcode) {
     return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
-           Opcode == Instruction::Shl;
+           Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
+           Opcode == Instruction::UDiv;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -10939,6 +10940,8 @@ public:
     case Instruction::Add:
     case Instruction::LShr:
     case Instruction::Shl:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
       VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
       break;
     default:
@@ -22066,8 +22069,10 @@ bool BoUpSLP::collectValuesToDemote(
     auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
       assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
       return all_of(E.Scalars, [&](Value *V) {
-        auto *I = cast<Instruction>(V);
         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+        if (E.hasCopyableElements() && E.isCopyableElement(V))
+          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
+        auto *I = cast<Instruction>(V);
         return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
       });
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b36298f..81deba2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -840,8 +840,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
       // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
       if (RedPhiR)
         return false;
-      if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
-          Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+      if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+              Cur->getRecurrenceKind())) {
         HasUnsupportedPhi = true;
         continue;
       }
@@ -861,10 +861,9 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   if (!MinMaxOp)
     return false;
 
-  RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
-  assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
+  assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+             RedPhiR->getRecurrenceKind()) &&
          "unsupported reduction");
-  (void)RedPhiRK;
 
   /// Check if the vector loop of \p Plan can early exit and restart
   /// execution of last vector iteration in the scalar loop. This requires all
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 94e2628..3a9770c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1230,6 +1230,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::ExtractLane:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
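
Note on the DFAJumpThreading change above: it replaces manual dyn_cast-driven PHI walks and two-pass removal with BasicBlock::phis(), make_early_inc_range, and PHINode::removeIncomingValueIf. The following is a minimal standalone sketch of that idiom, not part of the patch; it assumes current LLVM C++ IR APIs, and the IsStale predicate is a hypothetical stand-in for the pass's isPredecessor check.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Sketch: prune PHIs in BB the way the rewritten cleanPhiNodes() does.
// IsStale is a hypothetical predicate standing in for "not a predecessor".
static void prunePhiNodes(BasicBlock *BB,
                          function_ref<bool(BasicBlock *)> IsStale) {
  if (pred_empty(BB)) {
    // Block is unreachable: replace each PHI's uses with poison and erase
    // it. make_early_inc_range keeps iteration valid across the erase.
    for (PHINode &PN : make_early_inc_range(BB->phis())) {
      PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
      PN.eraseFromParent();
    }
    return;
  }
  // Otherwise drop stale incoming edges in one pass; the callback is
  // invoked per incoming index and matching entries are removed together.
  for (PHINode &Phi : BB->phis())
    Phi.removeIncomingValueIf([&](unsigned Idx) {
      return IsStale(Phi.getIncomingBlock(Idx));
    });
}

Compared with collecting victims into a temporary std::vector and erasing afterwards, removeIncomingValueIf compacts the PHI's operand list in a single pass, which avoids both iterator invalidation and repeated shifting of incoming entries.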