Diffstat (limited to 'llvm/lib')
78 files changed, 886 insertions, 737 deletions
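Editorial note: many of the hunks below stem from a single IR-level change that is visible in the IRBuilder.cpp, Verifier.cpp and AutoUpgrade.cpp hunks: llvm.masked.load/store/gather/scatter no longer carry an explicit i32 alignment operand, and the alignment is instead attached as an align parameter attribute on the pointer (or vector-of-pointers) argument, shifting the mask and passthru operands down by one. A minimal producer-side sketch through IRBuilder follows; the demo function, types and alignment value are made up for illustration, only the CreateMaskedLoad/CreateMaskedStore signatures and resulting call forms come from this diff.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Illustrative only: builds the new three-operand masked load/store. The
// alignment passed here ends up as an `align` parameter attribute on the
// pointer operand rather than as an i32 immediate operand.
static void emitMaskedOps(Module &M) {
  LLVMContext &Ctx = M.getContext();
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  auto *MaskTy = FixedVectorType::get(Type::getInt1Ty(Ctx), 4);
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx),
                                 {PointerType::getUnqual(Ctx), MaskTy},
                                 /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, GlobalValue::ExternalLinkage, "demo", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  Value *Ptr = F->getArg(0);
  Value *Mask = F->getArg(1);

  // New form: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %p,
  //                                                     <4 x i1> %m, <4 x i32> poison)
  Value *V = B.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask);
  // New form: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr align 16 %p,
  //                                                 <4 x i1> %m)
  B.CreateMaskedStore(V, Ptr, Align(16), Mask);
  B.CreateRetVoid();
}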
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index a5ba197..e9e2e7d 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -4056,8 +4056,8 @@ static Constant *ConstantFoldFixedVectorCall( switch (IntrinsicID) { case Intrinsic::masked_load: { auto *SrcPtr = Operands[0]; - auto *Mask = Operands[2]; - auto *Passthru = Operands[3]; + auto *Mask = Operands[1]; + auto *Passthru = Operands[2]; Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, FVTy, DL); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index e08ef60..dc813f6 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5440,9 +5440,10 @@ static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, // ptrtoint (ptradd (Ptr, X - ptrtoint(Ptr))) -> X Value *Ptr, *X; - if (CastOpc == Instruction::PtrToInt && - match(Op, m_PtrAdd(m_Value(Ptr), - m_Sub(m_Value(X), m_PtrToInt(m_Deferred(Ptr))))) && + if ((CastOpc == Instruction::PtrToInt || CastOpc == Instruction::PtrToAddr) && + match(Op, + m_PtrAdd(m_Value(Ptr), + m_Sub(m_Value(X), m_PtrToIntOrAddr(m_Deferred(Ptr))))) && X->getType() == Ty && Ty == Q.DL.getIndexType(Ptr->getType())) return X; @@ -6987,8 +6988,8 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee, switch (IID) { case Intrinsic::masked_load: case Intrinsic::masked_gather: { - Value *MaskArg = Args[2]; - Value *PassthruArg = Args[3]; + Value *MaskArg = Args[1]; + Value *PassthruArg = Args[2]; // If the mask is all zeros or undef, the "passthru" argument is the result. if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 1d1a5560..9a5ae2a 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -324,32 +324,44 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, FAM.invalidate(*Caller, PA); } Advice.updateCachedCallerFPI(FAM); - int64_t IRSizeAfter = - getIRSize(*Caller) + (CalleeWasDeleted ? 0 : Advice.CalleeIRSize); - CurrentIRSize += IRSizeAfter - (Advice.CallerIRSize + Advice.CalleeIRSize); + if (Caller == Callee) { + assert(!CalleeWasDeleted); + // We double-counted CallerAndCalleeEdges - since the caller and callee + // would be the same + assert(Advice.CallerAndCalleeEdges % 2 == 0); + CurrentIRSize += getIRSize(*Caller) - Advice.CallerIRSize; + EdgeCount += getCachedFPI(*Caller).DirectCallsToDefinedFunctions - + Advice.CallerAndCalleeEdges / 2; + // The NodeCount would stay the same. + } else { + int64_t IRSizeAfter = + getIRSize(*Caller) + (CalleeWasDeleted ? 0 : Advice.CalleeIRSize); + CurrentIRSize += IRSizeAfter - (Advice.CallerIRSize + Advice.CalleeIRSize); + + // We can delta-update module-wide features. We know the inlining only + // changed the caller, and maybe the callee (by deleting the latter). Nodes + // are simple to update. For edges, we 'forget' the edges that the caller + // and callee used to have before inlining, and add back what they currently + // have together. + int64_t NewCallerAndCalleeEdges = + getCachedFPI(*Caller).DirectCallsToDefinedFunctions; + + // A dead function's node is not actually removed from the call graph until + // the end of the call graph walk, but the node no longer belongs to any + // valid SCC. 
+ if (CalleeWasDeleted) { + --NodeCount; + NodesInLastSCC.erase(CG.lookup(*Callee)); + DeadFunctions.insert(Callee); + } else { + NewCallerAndCalleeEdges += + getCachedFPI(*Callee).DirectCallsToDefinedFunctions; + } + EdgeCount += (NewCallerAndCalleeEdges - Advice.CallerAndCalleeEdges); + } if (CurrentIRSize > SizeIncreaseThreshold * InitialIRSize) ForceStop = true; - // We can delta-update module-wide features. We know the inlining only changed - // the caller, and maybe the callee (by deleting the latter). - // Nodes are simple to update. - // For edges, we 'forget' the edges that the caller and callee used to have - // before inlining, and add back what they currently have together. - int64_t NewCallerAndCalleeEdges = - getCachedFPI(*Caller).DirectCallsToDefinedFunctions; - - // A dead function's node is not actually removed from the call graph until - // the end of the call graph walk, but the node no longer belongs to any valid - // SCC. - if (CalleeWasDeleted) { - --NodeCount; - NodesInLastSCC.erase(CG.lookup(*Callee)); - DeadFunctions.insert(Callee); - } else { - NewCallerAndCalleeEdges += - getCachedFPI(*Callee).DirectCallsToDefinedFunctions; - } - EdgeCount += (NewCallerAndCalleeEdges - Advice.CallerAndCalleeEdges); assert(CurrentIRSize >= 0 && EdgeCount >= 0 && NodeCount >= 0); } diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index dcc5117..1c5f08e 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -245,7 +245,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, assert(ArgIdx == 0 && "Invalid argument index"); auto *Ty = cast<VectorType>(II->getType()); - if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty)) + if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(1), Ty)) return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags); return MemoryLocation( @@ -255,7 +255,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, assert(ArgIdx == 1 && "Invalid argument index"); auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType()); - if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty)) + if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty)) return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags); return MemoryLocation( diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 425420f..6f7dd79 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15473,6 +15473,38 @@ void ScalarEvolution::LoopGuards::collectFromPHI( } } +// Return a new SCEV that modifies \p Expr to the closest number divides by +// \p Divisor and less or equal than Expr. For now, only handle constant +// Expr. +static const SCEV *getPreviousSCEVDivisibleByDivisor(const SCEV *Expr, + const APInt &DivisorVal, + ScalarEvolution &SE) { + const APInt *ExprVal; + if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || + DivisorVal.isNonPositive()) + return Expr; + APInt Rem = ExprVal->urem(DivisorVal); + // return the SCEV: Expr - Expr % Divisor + return SE.getConstant(*ExprVal - Rem); +} + +// Return a new SCEV that modifies \p Expr to the closest number divides by +// \p Divisor and greater or equal than Expr. For now, only handle constant +// Expr. 
+static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr, + const APInt &DivisorVal, + ScalarEvolution &SE) { + const APInt *ExprVal; + if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || + DivisorVal.isNonPositive()) + return Expr; + APInt Rem = ExprVal->urem(DivisorVal); + if (Rem.isZero()) + return Expr; + // return the SCEV: Expr + Divisor - Expr % Divisor + return SE.getConstant(*ExprVal + DivisorVal - Rem); +} + void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, @@ -15540,36 +15572,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock( match(LHS, m_scev_APInt(C)) && C->isNonNegative(); }; - // Return a new SCEV that modifies \p Expr to the closest number divides by - // \p Divisor and greater or equal than Expr. For now, only handle constant - // Expr. - auto GetNextSCEVDividesByDivisor = [&](const SCEV *Expr, - const APInt &DivisorVal) { - const APInt *ExprVal; - if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || - DivisorVal.isNonPositive()) - return Expr; - APInt Rem = ExprVal->urem(DivisorVal); - if (Rem.isZero()) - return Expr; - // return the SCEV: Expr + Divisor - Expr % Divisor - return SE.getConstant(*ExprVal + DivisorVal - Rem); - }; - - // Return a new SCEV that modifies \p Expr to the closest number divides by - // \p Divisor and less or equal than Expr. For now, only handle constant - // Expr. - auto GetPreviousSCEVDividesByDivisor = [&](const SCEV *Expr, - const APInt &DivisorVal) { - const APInt *ExprVal; - if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || - DivisorVal.isNonPositive()) - return Expr; - APInt Rem = ExprVal->urem(DivisorVal); - // return the SCEV: Expr - Expr % Divisor - return SE.getConstant(*ExprVal - Rem); - }; - // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, // recursively. This is done by aligning up/down the constant value to the // Divisor. @@ -15591,8 +15593,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); auto *DivisibleExpr = - IsMin ? GetPreviousSCEVDividesByDivisor(MinMaxLHS, DivisorVal) - : GetNextSCEVDividesByDivisor(MinMaxLHS, DivisorVal); + IsMin + ? 
getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE) + : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE); SmallVector<const SCEV *> Ops = { ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; return SE.getMinMaxExpr(SCTy, Ops); @@ -15669,21 +15672,21 @@ void ScalarEvolution::LoopGuards::collectFromBlock( [[fallthrough]]; case CmpInst::ICMP_SLT: { RHS = SE.getMinusSCEV(RHS, One); - RHS = GetPreviousSCEVDividesByDivisor(RHS, DividesBy); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); break; } case CmpInst::ICMP_UGT: case CmpInst::ICMP_SGT: RHS = SE.getAddExpr(RHS, One); - RHS = GetNextSCEVDividesByDivisor(RHS, DividesBy); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); break; case CmpInst::ICMP_ULE: case CmpInst::ICMP_SLE: - RHS = GetPreviousSCEVDividesByDivisor(RHS, DividesBy); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); break; case CmpInst::ICMP_UGE: case CmpInst::ICMP_SGE: - RHS = GetNextSCEVDividesByDivisor(RHS, DividesBy); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); break; default: break; @@ -15737,7 +15740,7 @@ void ScalarEvolution::LoopGuards::collectFromBlock( case CmpInst::ICMP_NE: if (match(RHS, m_scev_Zero())) { const SCEV *OneAlignedUp = - GetNextSCEVDividesByDivisor(One, DividesBy); + getNextSCEVDivisibleByDivisor(One, DividesBy, SE); To = SE.getUMaxExpr(FromRewritten, OneAlignedUp); } else { // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS), @@ -15963,8 +15966,11 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { if (MatchBinarySub(S, LHS, RHS)) { if (LHS > RHS) std::swap(LHS, RHS); - if (NotEqual.contains({LHS, RHS})) - return SE.getUMaxExpr(S, SE.getOne(S->getType())); + if (NotEqual.contains({LHS, RHS})) { + const SCEV *OneAlignedUp = getNextSCEVDivisibleByDivisor( + SE.getOne(S->getType()), SE.getConstantMultiple(S), SE); + return SE.getUMaxExpr(OneAlignedUp, S); + } } return nullptr; }; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9655c88..0a72076 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7695,6 +7695,11 @@ static bool isGuaranteedNotToBeUndefOrPoison( } if (IsWellDefined) return true; + } else if (auto *Splat = isa<ShuffleVectorInst>(Opr) ? getSplatValue(Opr) + : nullptr) { + // For splats we only need to check the value being splatted. + if (OpCheck(Splat)) + return true; } else if (all_of(Opr->operands(), OpCheck)) return true; } diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 04c7008..2b5ced3 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -993,7 +993,6 @@ static void addToWorklist(Instruction &I, static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC) { SmallVector<Instruction *, 4> Worklist; - bool Modified = false; unsigned MaxLegalFpConvertBitWidth = TLI.getMaxLargeFPConvertBitWidthSupported(); @@ -1003,50 +1002,49 @@ static bool runImpl(Function &F, const TargetLowering &TLI, if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS) return false; - for (auto It = inst_begin(&F), End = inst_end(F); It != End;) { - Instruction &I = *It++; + auto ShouldHandleInst = [&](Instruction &I) { Type *Ty = I.getType(); // TODO: This pass doesn't handle scalable vectors. 
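An editorial aside on the ScalarEvolution.cpp hunk above: the getPreviousSCEVDivisibleByDivisor / getNextSCEVDivisibleByDivisor helpers (hoisted out of collectFromBlock so LoopGuards::rewrite can reuse them) round a non-negative constant down or up to the nearest multiple of the divisor. Below is a minimal sketch of the arithmetic on plain 64-bit integers; the real code operates on SCEV/APInt constants and bails out on negative values and non-positive divisors.

#include <cassert>
#include <cstdint>

// Round Expr down to the closest multiple of Divisor: Expr - Expr % Divisor.
static uint64_t previousDivisible(uint64_t Expr, uint64_t Divisor) {
  assert(Divisor != 0);
  return Expr - Expr % Divisor;
}

// Round Expr up to the closest multiple of Divisor:
// Expr + Divisor - Expr % Divisor, unless Expr is already a multiple.
static uint64_t nextDivisible(uint64_t Expr, uint64_t Divisor) {
  assert(Divisor != 0);
  uint64_t Rem = Expr % Divisor;
  return Rem == 0 ? Expr : Expr + Divisor - Rem;
}

// E.g. with Divisor == 8: previousDivisible(13, 8) == 8, nextDivisible(13, 8) == 16,
// and both leave 16 unchanged. This mirrors how a guard like `x <u 13` on a value
// known to be a multiple of 8 can be tightened to `x <=u 8` in collectFromBlock.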
if (Ty->isScalableTy()) - continue; + return false; switch (I.getOpcode()) { case Instruction::FRem: - if (!targetSupportsFrem(TLI, Ty) && - FRemExpander::canExpandType(Ty->getScalarType())) { - addToWorklist(I, Worklist); - Modified = true; - } - break; + return !targetSupportsFrem(TLI, Ty) && + FRemExpander::canExpandType(Ty->getScalarType()); + case Instruction::FPToUI: case Instruction::FPToSI: { auto *IntTy = cast<IntegerType>(Ty->getScalarType()); - if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth) - continue; - - addToWorklist(I, Worklist); - Modified = true; - break; + return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth; } + case Instruction::UIToFP: case Instruction::SIToFP: { auto *IntTy = cast<IntegerType>(I.getOperand(0)->getType()->getScalarType()); - if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth) - continue; - - addToWorklist(I, Worklist); - Modified = true; - break; + return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth; } - default: - break; } + + return false; + }; + + bool Modified = false; + for (auto It = inst_begin(&F), End = inst_end(F); It != End;) { + Instruction &I = *It++; + if (!ShouldHandleInst(I)) + continue; + + addToWorklist(I, Worklist); + Modified = true; } while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); - if (I->getOpcode() == Instruction::FRem) { + + switch (I->getOpcode()) { + case Instruction::FRem: { auto SQ = [&]() -> std::optional<SimplifyQuery> { if (AC) { auto Res = std::make_optional<SimplifyQuery>( @@ -1058,11 +1056,18 @@ static bool runImpl(Function &F, const TargetLowering &TLI, }(); expandFRem(cast<BinaryOperator>(*I), SQ); - } else if (I->getOpcode() == Instruction::FPToUI || - I->getOpcode() == Instruction::FPToSI) { + break; + } + + case Instruction::FPToUI: + case Instruction::FPToSI: expandFPToI(I); - } else { + break; + + case Instruction::UIToFP: + case Instruction::SIToFP: expandIToFP(I); + break; } } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index a6a9b50..5c27a20 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -258,13 +258,11 @@ static Value *getMaskOperand(IntrinsicInst *II) { default: llvm_unreachable("Unexpected intrinsic"); case Intrinsic::vp_load: - return II->getOperand(1); case Intrinsic::masked_load: - return II->getOperand(2); + return II->getOperand(1); case Intrinsic::vp_store: - return II->getOperand(2); case Intrinsic::masked_store: - return II->getOperand(3); + return II->getOperand(2); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c97300d..6bf9008 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26876,6 +26876,8 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, // TODO: handle more extension/truncation cases as cases arise. if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); + if (VT.getSizeInBits() != N00.getValueSizeInBits()) + return SDValue(); // We can remove *extend_vector_inreg only if the truncation happens at // the same scale as the extension. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb0038c..20a0efd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4837,29 +4837,10 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, bool IsCompressing) { SDLoc sdl = getCurSDLoc(); - auto getMaskedStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0, - Align &Alignment) { - // llvm.masked.store.*(Src0, Ptr, alignment, Mask) - Src0 = I.getArgOperand(0); - Ptr = I.getArgOperand(1); - Alignment = cast<ConstantInt>(I.getArgOperand(2))->getAlignValue(); - Mask = I.getArgOperand(3); - }; - auto getCompressingStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0, - Align &Alignment) { - // llvm.masked.compressstore.*(Src0, Ptr, Mask) - Src0 = I.getArgOperand(0); - Ptr = I.getArgOperand(1); - Mask = I.getArgOperand(2); - Alignment = I.getParamAlign(1).valueOrOne(); - }; - - Value *PtrOperand, *MaskOperand, *Src0Operand; - Align Alignment; - if (IsCompressing) - getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment); - else - getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment); + Value *Src0Operand = I.getArgOperand(0); + Value *PtrOperand = I.getArgOperand(1); + Value *MaskOperand = I.getArgOperand(2); + Align Alignment = I.getParamAlign(1).valueOrOne(); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); @@ -4964,14 +4945,12 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); - // llvm.masked.scatter.*(Src0, Ptrs, alignment, Mask) + // llvm.masked.scatter.*(Src0, Ptrs, Mask) const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); - SDValue Mask = getValue(I.getArgOperand(3)); + SDValue Mask = getValue(I.getArgOperand(2)); EVT VT = Src0.getValueType(); - Align Alignment = cast<ConstantInt>(I.getArgOperand(2)) - ->getMaybeAlignValue() - .value_or(DAG.getEVTAlign(VT.getScalarType())); + Align Alignment = I.getParamAlign(1).valueOrOne(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Base; @@ -5008,29 +4987,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { SDLoc sdl = getCurSDLoc(); - auto getMaskedLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0, - Align &Alignment) { - // @llvm.masked.load.*(Ptr, alignment, Mask, Src0) - Ptr = I.getArgOperand(0); - Alignment = cast<ConstantInt>(I.getArgOperand(1))->getAlignValue(); - Mask = I.getArgOperand(2); - Src0 = I.getArgOperand(3); - }; - auto getExpandingLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0, - Align &Alignment) { - // @llvm.masked.expandload.*(Ptr, Mask, Src0) - Ptr = I.getArgOperand(0); - Alignment = I.getParamAlign(0).valueOrOne(); - Mask = I.getArgOperand(1); - Src0 = I.getArgOperand(2); - }; - - Value *PtrOperand, *MaskOperand, *Src0Operand; - Align Alignment; - if (IsExpanding) - getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment); - else - getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment); + Value *PtrOperand = I.getArgOperand(0); + Value *MaskOperand = I.getArgOperand(1); + Value *Src0Operand = I.getArgOperand(2); + Align Alignment = I.getParamAlign(0).valueOrOne(); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); @@ -5077,16 
+5037,14 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); - // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) + // @llvm.masked.gather.*(Ptrs, Mask, Src0) const Value *Ptr = I.getArgOperand(0); - SDValue Src0 = getValue(I.getArgOperand(3)); - SDValue Mask = getValue(I.getArgOperand(2)); + SDValue Src0 = getValue(I.getArgOperand(2)); + SDValue Mask = getValue(I.getArgOperand(1)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - Align Alignment = cast<ConstantInt>(I.getArgOperand(1)) - ->getMaybeAlignValue() - .value_or(DAG.getEVTAlign(VT.getScalarType())); + Align Alignment = I.getParamAlign(0).valueOrOne(); const MDNode *Ranges = getRangeMetadata(I); diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 8052773..8637b55 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -2427,11 +2427,13 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { uint64_t OrigStmtSeq = StmtSeq.get(); // 1. Get the original row index from the stmt list offset. auto OrigRowIter = SeqOffToOrigRow.find(OrigStmtSeq); + const uint64_t InvalidOffset = + Unit.getOrigUnit().getFormParams().getDwarfMaxOffset(); // Check whether we have an output sequence for the StmtSeq offset. // Some sequences are discarded by the DWARFLinker if they are invalid // (empty). if (OrigRowIter == SeqOffToOrigRow.end()) { - StmtSeq.set(UINT64_MAX); + StmtSeq.set(InvalidOffset); continue; } size_t OrigRowIndex = OrigRowIter->second; @@ -2441,7 +2443,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { if (NewRowIter == OrigRowToNewRow.end()) { // If the original row index is not found in the map, update the // stmt_sequence attribute to the 'invalid offset' magic value. - StmtSeq.set(UINT64_MAX); + StmtSeq.set(InvalidOffset); continue; } diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index fa39603..a326a01 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -320,12 +320,16 @@ static void convertFunctionLineTable(OutputAggregator &Out, CUInfo &CUI, // Attempt to retrieve DW_AT_LLVM_stmt_sequence if present. std::optional<uint64_t> StmtSeqOffset; if (auto StmtSeqAttr = Die.find(llvm::dwarf::DW_AT_LLVM_stmt_sequence)) { - // The `DW_AT_LLVM_stmt_sequence` attribute might be set to `UINT64_MAX` - // when it refers to an empty line sequence. In such cases, the DWARF linker - // will exclude the empty sequence from the final output and assign - // `UINT64_MAX` to the `DW_AT_LLVM_stmt_sequence` attribute. - uint64_t StmtSeqVal = dwarf::toSectionOffset(StmtSeqAttr, UINT64_MAX); - if (StmtSeqVal != UINT64_MAX) + // The `DW_AT_LLVM_stmt_sequence` attribute might be set to an invalid + // sentinel value when it refers to an empty line sequence. In such cases, + // the DWARF linker will exclude the empty sequence from the final output + // and assign the sentinel value to the `DW_AT_LLVM_stmt_sequence` + // attribute. The sentinel value is UINT32_MAX for DWARF32 and UINT64_MAX + // for DWARF64. 
+ const uint64_t InvalidOffset = + Die.getDwarfUnit()->getFormParams().getDwarfMaxOffset(); + uint64_t StmtSeqVal = dwarf::toSectionOffset(StmtSeqAttr, InvalidOffset); + if (StmtSeqVal != InvalidOffset) StmtSeqOffset = StmtSeqVal; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d8374b6..10f915d 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1426,6 +1426,28 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, Intrinsic::memset, ParamTypes); return true; } + + unsigned MaskedID = + StringSwitch<unsigned>(Name) + .StartsWith("masked.load", Intrinsic::masked_load) + .StartsWith("masked.gather", Intrinsic::masked_gather) + .StartsWith("masked.store", Intrinsic::masked_store) + .StartsWith("masked.scatter", Intrinsic::masked_scatter) + .Default(0); + if (MaskedID && F->arg_size() == 4) { + rename(F); + if (MaskedID == Intrinsic::masked_load || + MaskedID == Intrinsic::masked_gather) { + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), MaskedID, + {F->getReturnType(), F->getArg(0)->getType()}); + return true; + } + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), MaskedID, + {F->getArg(0)->getType(), F->getArg(1)->getType()}); + return true; + } break; } case 'n': { @@ -5231,6 +5253,54 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { break; } + case Intrinsic::masked_load: + case Intrinsic::masked_gather: + case Intrinsic::masked_store: + case Intrinsic::masked_scatter: { + if (CI->arg_size() != 4) { + DefaultCase(); + return; + } + + const DataLayout &DL = CI->getDataLayout(); + switch (NewFn->getIntrinsicID()) { + case Intrinsic::masked_load: + NewCall = Builder.CreateMaskedLoad( + CI->getType(), CI->getArgOperand(0), + cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(), + CI->getArgOperand(2), CI->getArgOperand(3)); + break; + case Intrinsic::masked_gather: + NewCall = Builder.CreateMaskedGather( + CI->getType(), CI->getArgOperand(0), + DL.getValueOrABITypeAlignment( + cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue(), + CI->getType()->getScalarType()), + CI->getArgOperand(2), CI->getArgOperand(3)); + break; + case Intrinsic::masked_store: + NewCall = Builder.CreateMaskedStore( + CI->getArgOperand(0), CI->getArgOperand(1), + cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(), + CI->getArgOperand(3)); + break; + case Intrinsic::masked_scatter: + NewCall = Builder.CreateMaskedScatter( + CI->getArgOperand(0), CI->getArgOperand(1), + DL.getValueOrABITypeAlignment( + cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue(), + CI->getArgOperand(0)->getType()->getScalarType()), + CI->getArgOperand(3)); + break; + default: + llvm_unreachable("Unexpected intrinsic ID"); + } + // Previous metadata is still valid. 
+ NewCall->copyMetadata(*CI); + NewCall->setTailCallKind(cast<CallInst>(CI)->getTailCallKind()); + break; + } + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { if (CI->arg_size() != 2) { diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 15c0198..88dbd17 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -495,9 +495,11 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, if (!PassThru) PassThru = PoisonValue::get(Ty); Type *OverloadedTypes[] = { Ty, PtrTy }; - Value *Ops[] = {Ptr, getInt32(Alignment.value()), Mask, PassThru}; - return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, - OverloadedTypes, Name); + Value *Ops[] = {Ptr, Mask, PassThru}; + CallInst *CI = + CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, OverloadedTypes, Name); + CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), Alignment)); + return CI; } /// Create a call to a Masked Store intrinsic. @@ -513,8 +515,11 @@ CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr, assert(DataTy->isVectorTy() && "Val should be a vector"); assert(Mask && "Mask should not be all-ones (null)"); Type *OverloadedTypes[] = { DataTy, PtrTy }; - Value *Ops[] = {Val, Ptr, getInt32(Alignment.value()), Mask}; - return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes); + Value *Ops[] = {Val, Ptr, Mask}; + CallInst *CI = + CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes); + CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), Alignment)); + return CI; } /// Create a call to a Masked intrinsic, with given intrinsic Id, @@ -552,12 +557,14 @@ CallInst *IRBuilderBase::CreateMaskedGather(Type *Ty, Value *Ptrs, PassThru = PoisonValue::get(Ty); Type *OverloadedTypes[] = {Ty, PtrsTy}; - Value *Ops[] = {Ptrs, getInt32(Alignment.value()), Mask, PassThru}; + Value *Ops[] = {Ptrs, Mask, PassThru}; // We specify only one type when we create this intrinsic. Types of other // arguments are derived from this type. - return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes, - Name); + CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, + OverloadedTypes, Name); + CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), Alignment)); + return CI; } /// Create a call to a Masked Scatter intrinsic. @@ -577,11 +584,14 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, Mask = getAllOnesMask(NumElts); Type *OverloadedTypes[] = {DataTy, PtrsTy}; - Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask}; + Value *Ops[] = {Data, Ptrs, Mask}; // We specify only one type when we create this intrinsic. Types of other // arguments are derived from this type. - return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes); + CallInst *CI = + CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes); + CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), Alignment)); + return CI; } /// Create a call to Masked Expand Load intrinsic diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 6797a10..526800e 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -725,6 +725,19 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, // There can never be multiple globals with the same name of different types, // because intrinsics must be a specific type. 
auto *FT = getType(M->getContext(), id, Tys); + Function *F = cast<Function>( + M->getOrInsertFunction( + Tys.empty() ? getName(id) : getName(id, Tys, M, FT), FT) + .getCallee()); + if (F->getFunctionType() == FT) + return F; + + // It's possible that a declaration for this intrinsic already exists with an + // incorrect signature, if the signature has changed, but this particular + // declaration has not been auto-upgraded yet. In that case, rename the + // invalid declaration and insert a new one with the correct signature. The + // invalid declaration will get upgraded later. + F->setName(F->getName() + ".invalid"); return cast<Function>( M->getOrInsertFunction( Tys.empty() ? getName(id) : getName(id, Tys, M, FT), FT) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 3572852..03da154 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6211,13 +6211,10 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Check(Call.getType()->isVectorTy(), "masked_load: must return a vector", Call); - ConstantInt *Alignment = cast<ConstantInt>(Call.getArgOperand(1)); - Value *Mask = Call.getArgOperand(2); - Value *PassThru = Call.getArgOperand(3); + Value *Mask = Call.getArgOperand(1); + Value *PassThru = Call.getArgOperand(2); Check(Mask->getType()->isVectorTy(), "masked_load: mask must be vector", Call); - Check(Alignment->getValue().isPowerOf2(), - "masked_load: alignment must be a power of 2", Call); Check(PassThru->getType() == Call.getType(), "masked_load: pass through and return type must match", Call); Check(cast<VectorType>(Mask->getType())->getElementCount() == @@ -6227,33 +6224,15 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::masked_store: { Value *Val = Call.getArgOperand(0); - ConstantInt *Alignment = cast<ConstantInt>(Call.getArgOperand(2)); - Value *Mask = Call.getArgOperand(3); + Value *Mask = Call.getArgOperand(2); Check(Mask->getType()->isVectorTy(), "masked_store: mask must be vector", Call); - Check(Alignment->getValue().isPowerOf2(), - "masked_store: alignment must be a power of 2", Call); Check(cast<VectorType>(Mask->getType())->getElementCount() == cast<VectorType>(Val->getType())->getElementCount(), "masked_store: vector mask must be same length as value", Call); break; } - case Intrinsic::masked_gather: { - const APInt &Alignment = - cast<ConstantInt>(Call.getArgOperand(1))->getValue(); - Check(Alignment.isZero() || Alignment.isPowerOf2(), - "masked_gather: alignment must be 0 or a power of 2", Call); - break; - } - case Intrinsic::masked_scatter: { - const APInt &Alignment = - cast<ConstantInt>(Call.getArgOperand(2))->getValue(); - Check(Alignment.isZero() || Alignment.isPowerOf2(), - "masked_scatter: alignment must be 0 or a power of 2", Call); - break; - } - case Intrinsic::experimental_guard: { Check(isa<CallInst>(Call), "experimental_guard cannot be invoked", Call); Check(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1, diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 7f0ea78..d4901d9 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -2903,7 +2903,7 @@ bool MasmParser::parseIdentifier(StringRef &Res, if (Position == StartOfStatement && StringSwitch<bool>(Res) .CaseLower("echo", true) - .CasesLower("ifdef", "ifndef", "elseifdef", "elseifndef", true) + .CasesLower({"ifdef", "ifndef", "elseifdef", "elseifndef"}, true) .Default(false)) { ExpandNextToken = DoNotExpandMacros; } 
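Stepping back from the MasmParser hunk for a moment: the AutoUpgrade.cpp and Intrinsics.cpp changes above keep old bitcode working, by renaming a stale four-operand declaration, inserting a declaration with the new signature, and rebuilding each old call with the alignment immediate turned into a parameter attribute. A rough sketch of the masked.load case follows; the helper name is hypothetical, and CI/Builder are assumed to come from the upgrade context.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper mirroring the masked.load branch of UpgradeIntrinsicCall
// above: decode the old i32 alignment operand and rebuild the call in the new
// form, where the alignment lives on the pointer argument as an attribute.
static CallInst *upgradeOldMaskedLoad(CallInst *CI, IRBuilderBase &Builder) {
  // Old: @llvm.masked.load.v4i32.p0(ptr %p, i32 16, <4 x i1> %m, <4 x i32> %pt)
  Align A = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
  // New: @llvm.masked.load.v4i32.p0(ptr align 16 %p, <4 x i1> %m, <4 x i32> %pt)
  CallInst *NewCall = Builder.CreateMaskedLoad(
      CI->getType(), CI->getArgOperand(0), A, CI->getArgOperand(2),
      CI->getArgOperand(3));
  NewCall->copyMetadata(*CI); // previous metadata is still valid
  return NewCall;
}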
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index b4de79a..4787604 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -2600,8 +2600,7 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, int exponentChange = omsb - fromSemantics.precision; if (exponent + exponentChange < toSemantics.minExponent) exponentChange = toSemantics.minExponent - exponent; - if (exponentChange < shift) - exponentChange = shift; + exponentChange = std::max(exponentChange, shift); if (exponentChange < 0) { shift -= exponentChange; exponent += exponentChange; @@ -3043,8 +3042,7 @@ IEEEFloat::roundSignificandWithExponent(const integerPart *decSigParts, if (decSig.exponent < semantics->minExponent) { excessPrecision += (semantics->minExponent - decSig.exponent); truncatedBits = excessPrecision; - if (excessPrecision > calcSemantics.precision) - excessPrecision = calcSemantics.precision; + excessPrecision = std::min(excessPrecision, calcSemantics.precision); } /* Extra half-ulp lost in reciprocal of exponent. */ powHUerr = (powStatus == opOK && calcLostFraction == lfExactlyZero) ? 0:2; @@ -3441,8 +3439,7 @@ char *IEEEFloat::convertNormalToHexString(char *dst, unsigned int hexDigits, /* Convert as much of "part" to hexdigits as we can. */ unsigned int curDigits = integerPartWidth / 4; - if (curDigits > outputDigits) - curDigits = outputDigits; + curDigits = std::min(curDigits, outputDigits); dst += partAsHex (dst, part, curDigits, hexDigitChars); outputDigits -= curDigits; } diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 573ad82..78d6540 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -868,8 +868,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { nwidth = strlen(name) - 1; } - if (nwidth > width) - width = nwidth; + width = std::max(nwidth, width); } for (int i = 0; i < depth; ++i) { diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index 9801627..e9660ac1 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -585,7 +585,7 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize, uint64_t ShiftedMask = (0xFFFFULL << Shift); uint64_t ZeroChunk = UImm & ~ShiftedMask; uint64_t OneChunk = UImm | ShiftedMask; - uint64_t RotatedImm = (UImm << 32) | (UImm >> 32); + uint64_t RotatedImm = llvm::rotl(UImm, 32); uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask); if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) || AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) || diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 12c600f..d5117da 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -700,7 +700,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { // csel instruction. If so, return the folded opcode, and the replacement // register. 
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, - unsigned *NewVReg = nullptr) { + unsigned *NewReg = nullptr) { VReg = removeCopies(MRI, VReg); if (!Register::isVirtualRegister(VReg)) return 0; @@ -708,8 +708,37 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); const MachineInstr *DefMI = MRI.getVRegDef(VReg); unsigned Opc = 0; - unsigned SrcOpNum = 0; + unsigned SrcReg = 0; switch (DefMI->getOpcode()) { + case AArch64::SUBREG_TO_REG: + // Check for the following way to define an 64-bit immediate: + // %0:gpr32 = MOVi32imm 1 + // %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32 + if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 0) + return 0; + if (!DefMI->getOperand(2).isReg()) + return 0; + if (!DefMI->getOperand(3).isImm() || + DefMI->getOperand(3).getImm() != AArch64::sub_32) + return 0; + DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg()); + if (DefMI->getOpcode() != AArch64::MOVi32imm) + return 0; + if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1) + return 0; + assert(Is64Bit); + SrcReg = AArch64::XZR; + Opc = AArch64::CSINCXr; + break; + + case AArch64::MOVi32imm: + case AArch64::MOVi64imm: + if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1) + return 0; + SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + case AArch64::ADDSXri: case AArch64::ADDSWri: // if NZCV is used, do not fold. @@ -724,7 +753,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || DefMI->getOperand(3).getImm() != 0) return 0; - SrcOpNum = 1; + SrcReg = DefMI->getOperand(1).getReg(); Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; break; @@ -734,7 +763,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) return 0; - SrcOpNum = 2; + SrcReg = DefMI->getOperand(2).getReg(); Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; break; } @@ -753,17 +782,17 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) return 0; - SrcOpNum = 2; + SrcReg = DefMI->getOperand(2).getReg(); Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; break; } default: return 0; } - assert(Opc && SrcOpNum && "Missing parameters"); + assert(Opc && SrcReg && "Missing parameters"); - if (NewVReg) - *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + if (NewReg) + *NewReg = SrcReg; return Opc; } @@ -964,28 +993,34 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, // Try folding simple instructions into the csel. if (TryFold) { - unsigned NewVReg = 0; - unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); + unsigned NewReg = 0; + unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg); if (FoldedOpc) { // The folded opcodes csinc, csinc and csneg apply the operation to // FalseReg, so we need to invert the condition. CC = AArch64CC::getInvertedCondCode(CC); TrueReg = FalseReg; } else - FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); + FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg); // Fold the operation. 
Leave any dead instructions for DCE to clean up. if (FoldedOpc) { - FalseReg = NewVReg; + FalseReg = NewReg; Opc = FoldedOpc; - // The extends the live range of NewVReg. - MRI.clearKillFlags(NewVReg); + // Extend the live range of NewReg. + MRI.clearKillFlags(NewReg); } } // Pull all virtual register into the appropriate class. MRI.constrainRegClass(TrueReg, RC); - MRI.constrainRegClass(FalseReg, RC); + // FalseReg might be WZR or XZR if the folded operand is a literal 1. + assert( + (FalseReg.isVirtual() || FalseReg == AArch64::WZR || + FalseReg == AArch64::XZR) && + "FalseReg was folded into a non-virtual register other than WZR or XZR"); + if (FalseReg.isVirtual()) + MRI.constrainRegClass(FalseReg, RC); // Insert the csel. BuildMI(MBB, I, DL, get(Opc), DstReg) @@ -5063,7 +5098,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool RenamableDest, bool RenamableSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && - (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + AArch64::GPR32spRegClass.contains(SrcReg)) { if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { // If either operand is WSP, expand to ADD #0. if (Subtarget.hasZeroCycleRegMoveGPR64() && @@ -5088,21 +5123,14 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } - } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) { - BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) - .addImm(0) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else if (Subtarget.hasZeroCycleRegMoveGPR64() && !Subtarget.hasZeroCycleRegMoveGPR32()) { // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); assert(DestRegX.isValid() && "Destination super-reg not valid"); - MCRegister SrcRegX = - SrcReg == AArch64::WZR - ? AArch64::XZR - : RI.getMatchingSuperReg(SrcReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); + MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); assert(SrcRegX.isValid() && "Source super-reg not valid"); // This instruction is reading and writing X registers. This may upset // the register scavenger and machine verifier, so we need to indicate @@ -5121,6 +5149,51 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // GPR32 zeroing + if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) { + if (Subtarget.hasZeroCycleZeroingGPR32()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(AArch64::WZR); + } + return; + } + + if (AArch64::GPR64spRegClass.contains(DestReg) && + AArch64::GPR64spRegClass.contains(SrcReg)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. 
+ BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // GPR64 zeroing + if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) { + if (Subtarget.hasZeroCycleZeroingGPR64()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(AArch64::XZR); + } + return; + } + // Copy a Predicate register by ORRing with itself. if (AArch64::PPRRegClass.contains(DestReg) && AArch64::PPRRegClass.contains(SrcReg)) { @@ -5205,27 +5278,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (AArch64::GPR64spRegClass.contains(DestReg) && - (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { - if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { - // If either operand is SP, expand to ADD #0. - BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); - } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) { - BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) - .addImm(0) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); - } else { - // Otherwise, expand to ORR XZR. - BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) - .addReg(AArch64::XZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - // Copy a DDDD register quad by copying the individual sub-registers. if (AArch64::DDDDRegClass.contains(DestReg) && AArch64::DDDDRegClass.contains(SrcReg)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 19e2a6a..93732a7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -244,11 +244,8 @@ void getInterestingMemoryOperands( // Masked store has an initial operand for the value. unsigned OpOffset = IsWrite ? 1 : 0; Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType(); - MaybeAlign Alignment = Align(1); - // Otherwise no alignment guarantees. We probably got Undef. - if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) - Alignment = Op->getMaybeAlignValue(); - Value *Mask = CI->getOperand(2 + OpOffset); + MaybeAlign Alignment = CI->getParamAlign(OpOffset); + Value *Mask = CI->getOperand(1 + OpOffset); Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask); break; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index f291e37..c8bbcbb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -169,7 +169,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), - TargetTriple(TT), TargetID(*this), InstrItins(getInstrItineraryForCPU(GPU)), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index c2e6078..a466780 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -60,7 +60,6 @@ private: protected: // Basic subtarget description. 
- Triple TargetTriple; AMDGPU::IsaInfo::AMDGPUTargetID TargetID; unsigned Gen = INVALID; InstrItineraryData InstrItins; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0189e7b..5c39f7a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1034,16 +1034,13 @@ void SIFrameLowering::emitCSRSpillStores( StoreWWMRegisters(WWMCalleeSavedRegs); if (FuncInfo->isWholeWaveFunction()) { - // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove - // it now. If we have already saved some WWM CSR registers, then the EXEC is - // already -1 and we don't need to do anything else. Otherwise, set EXEC to - // -1 here. + // If we have already saved some WWM CSR registers, then the EXEC is already + // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here. if (!ScratchExecCopy) buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ true); else if (WWMCalleeSavedRegs.empty()) EnableAllLanes(); - TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) @@ -1340,6 +1337,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, "Needed to save BP but didn't save it anywhere"); assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); + + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose. + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 27e5ee9c..74d4153 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3481,30 +3481,6 @@ def : GCNPat< >; } // End True16Predicate -let True16Predicate = UseRealTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; -} // End True16Predicate - -let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) ->; -} // End True16Predicate - def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 484861d..362ef14 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -360,11 +360,13 @@ public: /// between memory instructions to enforce the order they become visible as /// observed by other memory instructions executing in memory scope \p Scope. /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between - /// address spaces. Returns true iff any instructions inserted. + /// address spaces. 
If \p AtomicsOnly is true, only insert waits for counters + /// that are used by atomic instructions. + /// Returns true iff any instructions inserted. virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const = 0; + AtomicOrdering Order, bool AtomicsOnly) const = 0; /// Inserts any necessary instructions at position \p Pos relative to /// instruction \p MI to ensure any subsequent memory instructions of this @@ -437,7 +439,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -484,7 +486,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -572,7 +574,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -629,7 +631,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; @@ -1120,7 +1122,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1140,7 +1143,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const { + AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -1294,7 +1298,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, bool IsCrossAddrSpaceOrdering, Position Pos) const { return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); } bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -1447,7 +1452,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. 
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1467,8 +1473,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, - AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { if (ST.isTgSplitEnabled()) { // In threadgroup split mode the waves of a work-group can be executing on // different CUs. Therefore need to wait for global or GDS memory operations @@ -1488,7 +1494,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, AddrSpace &= ~SIAtomicAddrSpace::LDS; } return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, - IsCrossAddrSpaceOrdering, Pos, Order); + IsCrossAddrSpaceOrdering, Pos, Order, + AtomicsOnly); } bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -1747,7 +1754,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1904,7 +1912,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other // S_WAITCNT needed. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } @@ -1984,7 +1993,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -2007,7 +2017,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -2444,7 +2457,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. 
Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { + if (!AtomicsOnly && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2587,7 +2600,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } @@ -2624,7 +2638,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); } return Changed; @@ -2748,13 +2763,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), SIMemOp::LOAD | SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE, Order); + Position::BEFORE, Order, /*AtomicsOnly=*/false); if (Order == AtomicOrdering::Acquire || Order == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // The wait below only needs to wait on the prior atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER, Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2830,9 +2847,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); if (Order == AtomicOrdering::Acquire) { - Changed |= CC->insertWait( - MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); + // Acquire fences only need to wait on the previous atomic they pair with. + Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace, + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, Order, /*AtomicsOnly=*/true); } if (Order == AtomicOrdering::Release || @@ -2897,10 +2916,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Order == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // Only wait on the previous atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? 
SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, + Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index ce59ae0..2cd5f02 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -407,9 +407,9 @@ Instruction *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { // Potentially optimising the addressing modes as we do so. auto *Ty = cast<FixedVectorType>(I->getType()); Value *Ptr = I->getArgOperand(0); - Align Alignment = cast<ConstantInt>(I->getArgOperand(1))->getAlignValue(); - Value *Mask = I->getArgOperand(2); - Value *PassThru = I->getArgOperand(3); + Align Alignment = I->getParamAlign(0).valueOrOne(); + Value *Mask = I->getArgOperand(1); + Value *PassThru = I->getArgOperand(2); if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), Alignment)) @@ -458,7 +458,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBase( if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) // Can't build an intrinsic for this return nullptr; - Value *Mask = I->getArgOperand(2); + Value *Mask = I->getArgOperand(1); if (match(Mask, m_One())) return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, {Ty, Ptr->getType()}, @@ -479,7 +479,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB( if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) // Can't build an intrinsic for this return nullptr; - Value *Mask = I->getArgOperand(2); + Value *Mask = I->getArgOperand(1); if (match(Mask, m_One())) return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb, {Ty, Ptr->getType()}, @@ -552,7 +552,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( return nullptr; Root = Extend; - Value *Mask = I->getArgOperand(2); + Value *Mask = I->getArgOperand(1); Instruction *Load = nullptr; if (!match(Mask, m_One())) Load = Builder.CreateIntrinsic( @@ -584,7 +584,7 @@ Instruction *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) { // Potentially optimising the addressing modes as we do so. 
Value *Input = I->getArgOperand(0); Value *Ptr = I->getArgOperand(1); - Align Alignment = cast<ConstantInt>(I->getArgOperand(2))->getAlignValue(); + Align Alignment = I->getParamAlign(1).valueOrOne(); auto *Ty = cast<FixedVectorType>(Input->getType()); if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), @@ -622,7 +622,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBase( // Can't build an intrinsic for this return nullptr; } - Value *Mask = I->getArgOperand(3); + Value *Mask = I->getArgOperand(2); // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask) LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n"); if (match(Mask, m_One())) @@ -646,7 +646,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB( if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) // Can't build an intrinsic for this return nullptr; - Value *Mask = I->getArgOperand(3); + Value *Mask = I->getArgOperand(2); if (match(Mask, m_One())) return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb, {Ptr->getType(), Input->getType()}, @@ -662,7 +662,7 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) { using namespace PatternMatch; Value *Input = I->getArgOperand(0); - Value *Mask = I->getArgOperand(3); + Value *Mask = I->getArgOperand(2); Type *InputTy = Input->getType(); Type *MemoryTy = InputTy; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index e4c0a16..9ab5202 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -300,7 +300,6 @@ private: const_iterator end() const { return Blocks.end(); } }; - Align getAlignFromValue(const Value *V) const; std::optional<AddrInfo> getAddrInfo(Instruction &In) const; bool isHvx(const AddrInfo &AI) const; // This function is only used for assertions at the moment. 
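Note on the masked-intrinsic hunks above and below: llvm.masked.load/store (and the gather/scatter variants) no longer take an explicit alignment argument, so the mask and passthru operands move down one slot and the alignment is read from the align attribute on the pointer parameter. A minimal sketch of a consumer under that assumption; the helper name inspectMaskedLoad is illustrative only and not part of the patch:

// Sketch only: reading llvm.masked.load operands after the explicit
// alignment argument was removed. inspectMaskedLoad is a hypothetical helper.
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;

static void inspectMaskedLoad(IntrinsicInst *II) {
  assert(II->getIntrinsicID() == Intrinsic::masked_load && "expected masked.load");
  Value *Ptr = II->getArgOperand(0);      // pointer operand (unchanged)
  Value *Mask = II->getArgOperand(1);     // was operand 2
  Value *PassThru = II->getArgOperand(2); // was operand 3
  // Alignment now lives on the pointer parameter attribute; default to 1
  // when the attribute is absent.
  Align Alignment = II->getParamAlign(0).valueOrOne();
  (void)Ptr; (void)Mask; (void)PassThru; (void)Alignment;
}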
@@ -612,12 +611,6 @@ auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> { return Values; } -auto AlignVectors::getAlignFromValue(const Value *V) const -> Align { - const auto *C = dyn_cast<ConstantInt>(V); - assert(C && "Alignment must be a compile-time constant integer"); - return C->getAlignValue(); -} - auto AlignVectors::getAddrInfo(Instruction &In) const -> std::optional<AddrInfo> { if (auto *L = isCandidate<LoadInst>(&In)) @@ -631,11 +624,11 @@ auto AlignVectors::getAddrInfo(Instruction &In) const switch (ID) { case Intrinsic::masked_load: return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(), - getAlignFromValue(II->getArgOperand(1))); + II->getParamAlign(0).valueOrOne()); case Intrinsic::masked_store: return AddrInfo(HVC, II, II->getArgOperand(1), II->getArgOperand(0)->getType(), - getAlignFromValue(II->getArgOperand(2))); + II->getParamAlign(1).valueOrOne()); } } return std::nullopt; @@ -660,9 +653,9 @@ auto AlignVectors::getMask(Value *Val) const -> Value * { if (auto *II = dyn_cast<IntrinsicInst>(Val)) { switch (II->getIntrinsicID()) { case Intrinsic::masked_load: - return II->getArgOperand(2); + return II->getArgOperand(1); case Intrinsic::masked_store: - return II->getArgOperand(3); + return II->getArgOperand(2); } } @@ -675,7 +668,7 @@ auto AlignVectors::getMask(Value *Val) const -> Value * { auto AlignVectors::getPassThrough(Value *Val) const -> Value * { if (auto *II = dyn_cast<IntrinsicInst>(Val)) { if (II->getIntrinsicID() == Intrinsic::masked_load) - return II->getArgOperand(3); + return II->getArgOperand(2); } return UndefValue::get(getPayload(Val)->getType()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 5143d53..613dea6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2025,10 +2025,10 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), sub_128)>; // abs -def : Pat<(abs v32i8:$xj), (XVMAX_B v32i8:$xj, (XVNEG_B v32i8:$xj))>; -def : Pat<(abs v16i16:$xj), (XVMAX_H v16i16:$xj, (XVNEG_H v16i16:$xj))>; -def : Pat<(abs v8i32:$xj), (XVMAX_W v8i32:$xj, (XVNEG_W v8i32:$xj))>; -def : Pat<(abs v4i64:$xj), (XVMAX_D v4i64:$xj, (XVNEG_D v4i64:$xj))>; +def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>; +def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>; +def : Pat<(abs v8i32:$xj), (XVSIGNCOV_W v8i32:$xj, v8i32:$xj)>; +def : Pat<(abs v4i64:$xj), (XVSIGNCOV_D v4i64:$xj, v4i64:$xj)>; // XVABSD_{B/H/W/D}[U] defm : PatXrXr<abds, "XVABSD">; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 8d1dc99..4619c6b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -2155,10 +2155,10 @@ def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; // abs -def : Pat<(abs v16i8:$vj), (VMAX_B v16i8:$vj, (VNEG_B v16i8:$vj))>; -def : Pat<(abs v8i16:$vj), (VMAX_H v8i16:$vj, (VNEG_H v8i16:$vj))>; -def : Pat<(abs v4i32:$vj), (VMAX_W v4i32:$vj, (VNEG_W v4i32:$vj))>; -def : Pat<(abs v2i64:$vj), (VMAX_D v2i64:$vj, (VNEG_D v2i64:$vj))>; +def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>; +def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>; +def : Pat<(abs v4i32:$vj), (VSIGNCOV_W v4i32:$vj, v4i32:$vj)>; +def : Pat<(abs v2i64:$vj), (VSIGNCOV_D v2i64:$vj, v2i64:$vj)>; // VABSD_{B/H/W/D}[U] defm : 
PatVrVr<abds, "VABSD">; diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index ab93bba..b00589a 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -68,7 +68,7 @@ const llvm::StringRef RISCVSEWInstrument::DESC_NAME = "RISCV-SEW"; bool RISCVSEWInstrument::isDataValid(llvm::StringRef Data) { // Return true if not one of the valid SEW strings return StringSwitch<bool>(Data) - .Cases("E8", "E16", "E32", "E64", true) + .Cases({"E8", "E16", "E32", "E64"}, true) .Default(false); } diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 52dc53e..25b5af8 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -495,18 +495,19 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr, bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { VectorType *DataType; Value *StoreVal = nullptr, *Ptr, *Mask, *EVL = nullptr; - MaybeAlign MA; + Align Alignment; switch (II->getIntrinsicID()) { case Intrinsic::masked_gather: DataType = cast<VectorType>(II->getType()); Ptr = II->getArgOperand(0); - MA = cast<ConstantInt>(II->getArgOperand(1))->getMaybeAlignValue(); - Mask = II->getArgOperand(2); + Alignment = II->getParamAlign(0).valueOrOne(); + Mask = II->getArgOperand(1); break; case Intrinsic::vp_gather: DataType = cast<VectorType>(II->getType()); Ptr = II->getArgOperand(0); - MA = II->getParamAlign(0).value_or( + // FIXME: Falling back to ABI alignment is incorrect. + Alignment = II->getParamAlign(0).value_or( DL->getABITypeAlign(DataType->getElementType())); Mask = II->getArgOperand(1); EVL = II->getArgOperand(2); @@ -515,14 +516,15 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { DataType = cast<VectorType>(II->getArgOperand(0)->getType()); StoreVal = II->getArgOperand(0); Ptr = II->getArgOperand(1); - MA = cast<ConstantInt>(II->getArgOperand(2))->getMaybeAlignValue(); - Mask = II->getArgOperand(3); + Alignment = II->getParamAlign(1).valueOrOne(); + Mask = II->getArgOperand(2); break; case Intrinsic::vp_scatter: DataType = cast<VectorType>(II->getArgOperand(0)->getType()); StoreVal = II->getArgOperand(0); Ptr = II->getArgOperand(1); - MA = II->getParamAlign(1).value_or( + // FIXME: Falling back to ABI alignment is incorrect. + Alignment = II->getParamAlign(1).value_or( DL->getABITypeAlign(DataType->getElementType())); Mask = II->getArgOperand(2); EVL = II->getArgOperand(3); @@ -533,7 +535,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { // Make sure the operation will be supported by the backend. EVT DataTypeVT = TLI->getValueType(*DL, DataType); - if (!MA || !TLI->isLegalStridedLoadStore(DataTypeVT, *MA)) + if (!TLI->isLegalStridedLoadStore(DataTypeVT, Alignment)) return false; // FIXME: Let the backend type legalize by splitting/widening? 
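The LoongArch abs patterns a few hunks above rely on signcov(x, x) being equal to abs(x). Assuming the usual [X]VSIGNCOV semantics (a negative first source negates the second, a zero first source zeroes it, a positive one passes it through), a per-lane scalar model looks like the sketch below; signcov is an illustrative helper, not the instruction's formal definition:

// Sketch only: scalar model of one lane of [x]vsigncov, under the assumed
// semantics described above.
#include <cstdint>

static int8_t signcov(int8_t j, int8_t k) {
  if (j < 0)
    return static_cast<int8_t>(-k); // wraps for INT8_MIN, like ISD::ABS
  if (j == 0)
    return 0;
  return k;
}

// signcov(x, x) yields |x| for every lane, which is why a single
// [X]VSIGNCOV_* can replace the previous MAX(x, NEG(x)) pair.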
@@ -571,7 +573,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { // Merge llvm.masked.gather's passthru if (II->getIntrinsicID() == Intrinsic::masked_gather) - Call = Builder.CreateSelect(Mask, Call, II->getArgOperand(3)); + Call = Builder.CreateSelect(Mask, Call, II->getArgOperand(2)); } else Call = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_store, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 9358486..f7d1a09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -438,130 +438,6 @@ let Predicates = [HasStdExtZvfbfmin] in { FRM_DYN, fvti.AVL, fvti.Log2SEW, TA_MA)>; } - - defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBF16Vectors>; - defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", - AllBF16Vectors, uimm5>; - defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", - eew=16, vtilist=AllBF16Vectors>; - defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllBF16Vectors, uimm5>; - defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllBF16Vectors, uimm5>; - - foreach fvti = AllBF16Vectors in { - defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", - fvti.Vector, - fvti.Vector, fvti.Vector, fvti.Mask, - fvti.Log2SEW, fvti.LMul, fvti.RegClass, - fvti.RegClass, fvti.RegClass>; - defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE", - "V"#fvti.ScalarSuffix#"M", - fvti.Vector, - fvti.Vector, fvti.Scalar, fvti.Mask, - fvti.Log2SEW, fvti.LMul, fvti.RegClass, - fvti.RegClass, fvti.ScalarRegClass>; - defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX); - def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru), - (fvti.Vector fvti.RegClass:$rs2), - (fvti.Scalar (fpimm0)), - (fvti.Mask VMV0:$vm), VLOpFrag)), - (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; - - defvar ivti = GetIntVTypeInfo<fvti>.Vti; - def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, - fvti.RegClass:$rs2)), - (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), - fvti.AVL, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), - (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), - fvti.RegClass:$rs2)), - (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), - (SplatFPOp (fvti.Scalar fpimm0)), - fvti.RegClass:$rs2)), - (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), - (SplatFPOp fvti.ScalarRegClass:$rs1), - fvti.RegClass:$rs2)), - (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, - (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), - fvti.RegClass:$rs1, - fvti.RegClass:$rs2, - fvti.RegClass:$passthru, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) - 
fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), - (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), - fvti.RegClass:$rs2, - fvti.RegClass:$passthru, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; - - - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), - (SplatFPOp (fvti.Scalar fpimm0)), - fvti.RegClass:$rs2, - fvti.RegClass:$passthru, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), - (SplatFPOp fvti.ScalarRegClass:$rs1), - fvti.RegClass:$rs2, - fvti.RegClass:$passthru, - VLOpFrag)), - (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, - (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector - (riscv_vrgather_vv_vl fvti.RegClass:$rs2, - (ivti.Vector fvti.RegClass:$rs1), - fvti.RegClass:$passthru, - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast<Instruction>("PseudoVRGATHER_VV_"# fvti.LMul.MX#"_E"# fvti.SEW#"_MASK") - fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector (riscv_vrgather_vx_vl fvti.RegClass:$rs2, GPR:$rs1, - fvti.RegClass:$passthru, - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast<Instruction>("PseudoVRGATHER_VX_"# fvti.LMul.MX#"_MASK") - fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$rs1, - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(fvti.Vector - (riscv_vrgather_vx_vl fvti.RegClass:$rs2, - uimm5:$imm, - fvti.RegClass:$passthru, - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast<Instruction>("PseudoVRGATHER_VI_"# fvti.LMul.MX#"_MASK") - fvti.RegClass:$passthru, fvti.RegClass:$rs2, uimm5:$imm, - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; - } } let Predicates = [HasStdExtZvfbfwma] in { diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 5e10631..528bbdf 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -169,9 +169,9 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, } case Intrinsic::masked_load: { Ptr = II->getOperand(0); - Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue(); + Alignment = II->getParamAlign(0).valueOrOne(); - if (!isa<UndefValue>(II->getOperand(3))) + if (!isa<UndefValue>(II->getOperand(2))) return false; assert(Mask && "masked.load needs a mask!"); @@ -183,7 +183,7 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, } case Intrinsic::masked_store: { Ptr = II->getOperand(1); - Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue(); + Alignment = II->getParamAlign(1).valueOrOne(); assert(Mask && "masked.store needs a mask!"); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index dbe8e18..d91923b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -507,7 +507,9 @@ static Register buildLoadInst(SPIRVType *BaseType, Register 
PtrRegister, static Register buildBuiltinVariableLoad( MachineIRBuilder &MIRBuilder, SPIRVType *VariableType, SPIRVGlobalRegistry *GR, SPIRV::BuiltIn::BuiltIn BuiltinValue, LLT LLType, - Register Reg = Register(0), bool isConst = true, bool hasLinkageTy = true) { + Register Reg = Register(0), bool isConst = true, + const std::optional<SPIRV::LinkageType::LinkageType> &LinkageTy = { + SPIRV::LinkageType::Import}) { Register NewRegister = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::pIDRegClass); MIRBuilder.getMRI()->setType( @@ -521,9 +523,8 @@ static Register buildBuiltinVariableLoad( // Set up the global OpVariable with the necessary builtin decorations. Register Variable = GR->buildGlobalVariable( NewRegister, PtrType, getLinkStringForBuiltIn(BuiltinValue), nullptr, - SPIRV::StorageClass::Input, nullptr, /* isConst= */ isConst, - /* HasLinkageTy */ hasLinkageTy, SPIRV::LinkageType::Import, MIRBuilder, - false); + SPIRV::StorageClass::Input, nullptr, /* isConst= */ isConst, LinkageTy, + MIRBuilder, false); // Load the value from the global variable. Register LoadedRegister = @@ -1851,7 +1852,7 @@ static bool generateWaveInst(const SPIRV::IncomingCall *Call, return buildBuiltinVariableLoad( MIRBuilder, Call->ReturnType, GR, Value, LLType, Call->ReturnRegister, - /* isConst= */ false, /* hasLinkageTy= */ false); + /* isConst= */ false, /* LinkageType= */ std::nullopt); } // We expect a builtin diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 1a7c02c..9e11c3a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -479,19 +479,9 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, .addImm(static_cast<uint32_t>(getExecutionModel(*ST, F))) .addUse(FuncVReg); addStringImm(F.getName(), MIB); - } else if (F.getLinkage() != GlobalValue::InternalLinkage && - F.getLinkage() != GlobalValue::PrivateLinkage && - F.getVisibility() != GlobalValue::HiddenVisibility) { - SPIRV::LinkageType::LinkageType LnkTy = - F.isDeclaration() - ? SPIRV::LinkageType::Import - : (F.getLinkage() == GlobalValue::LinkOnceODRLinkage && - ST->canUseExtension( - SPIRV::Extension::SPV_KHR_linkonce_odr) - ? 
SPIRV::LinkageType::LinkOnceODR - : SPIRV::LinkageType::Export); + } else if (const auto LnkTy = getSpirvLinkageTypeFor(*ST, F)) { buildOpDecorate(FuncVReg, MIRBuilder, SPIRV::Decoration::LinkageAttributes, - {static_cast<uint32_t>(LnkTy)}, F.getName()); + {static_cast<uint32_t>(*LnkTy)}, F.getName()); } // Handle function pointers decoration diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6fd1c7e..6181abb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -712,9 +712,9 @@ SPIRVGlobalRegistry::buildConstantSampler(Register ResReg, unsigned AddrMode, Register SPIRVGlobalRegistry::buildGlobalVariable( Register ResVReg, SPIRVType *BaseType, StringRef Name, const GlobalValue *GV, SPIRV::StorageClass::StorageClass Storage, - const MachineInstr *Init, bool IsConst, bool HasLinkageTy, - SPIRV::LinkageType::LinkageType LinkageType, MachineIRBuilder &MIRBuilder, - bool IsInstSelector) { + const MachineInstr *Init, bool IsConst, + const std::optional<SPIRV::LinkageType::LinkageType> &LinkageType, + MachineIRBuilder &MIRBuilder, bool IsInstSelector) { const GlobalVariable *GVar = nullptr; if (GV) { GVar = cast<const GlobalVariable>(GV); @@ -792,9 +792,9 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::Alignment, {Alignment}); } - if (HasLinkageTy) + if (LinkageType) buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::LinkageAttributes, - {static_cast<uint32_t>(LinkageType)}, Name); + {static_cast<uint32_t>(*LinkageType)}, Name); SPIRV::BuiltIn::BuiltIn BuiltInId; if (getSpirvBuiltInIdByName(Name, BuiltInId)) @@ -821,8 +821,8 @@ Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding( MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass); buildGlobalVariable(VarReg, VarType, Name, nullptr, - getPointerStorageClass(VarType), nullptr, false, false, - SPIRV::LinkageType::Import, MIRBuilder, false); + getPointerStorageClass(VarType), nullptr, false, + std::nullopt, MIRBuilder, false); buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::DescriptorSet, {Set}); buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::Binding, {Binding}); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index a648def..c230e62 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -548,14 +548,12 @@ public: MachineIRBuilder &MIRBuilder); Register getOrCreateUndef(MachineInstr &I, SPIRVType *SpvType, const SPIRVInstrInfo &TII); - Register buildGlobalVariable(Register Reg, SPIRVType *BaseType, - StringRef Name, const GlobalValue *GV, - SPIRV::StorageClass::StorageClass Storage, - const MachineInstr *Init, bool IsConst, - bool HasLinkageTy, - SPIRV::LinkageType::LinkageType LinkageType, - MachineIRBuilder &MIRBuilder, - bool IsInstSelector); + Register buildGlobalVariable( + Register Reg, SPIRVType *BaseType, StringRef Name, const GlobalValue *GV, + SPIRV::StorageClass::StorageClass Storage, const MachineInstr *Init, + bool IsConst, + const std::optional<SPIRV::LinkageType::LinkageType> &LinkageType, + MachineIRBuilder &MIRBuilder, bool IsInstSelector); Register getOrCreateGlobalVariableWithBinding(const SPIRVType *VarType, uint32_t Set, uint32_t Binding, StringRef Name, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index a0cff4d..5591d9f 100644 --- 
a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -4350,15 +4350,8 @@ bool SPIRVInstructionSelector::selectGlobalValue( if (hasInitializer(GlobalVar) && !Init) return true; - bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage() && - !GV->hasHiddenVisibility(); - SPIRV::LinkageType::LinkageType LnkType = - GV->isDeclarationForLinker() - ? SPIRV::LinkageType::Import - : (GV->hasLinkOnceODRLinkage() && - STI.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr) - ? SPIRV::LinkageType::LinkOnceODR - : SPIRV::LinkageType::Export); + const std::optional<SPIRV::LinkageType::LinkageType> LnkType = + getSpirvLinkageTypeFor(STI, *GV); const unsigned AddrSpace = GV->getAddressSpace(); SPIRV::StorageClass::StorageClass StorageClass = @@ -4366,7 +4359,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( SPIRVType *ResType = GR.getOrCreateSPIRVPointerType(GVType, I, StorageClass); Register Reg = GR.buildGlobalVariable( ResVReg, ResType, GlobalIdent, GV, StorageClass, Init, - GlobalVar->isConstant(), HasLnkTy, LnkType, MIRBuilder, true); + GlobalVar->isConstant(), LnkType, MIRBuilder, true); return Reg.isValid(); } @@ -4517,8 +4510,8 @@ bool SPIRVInstructionSelector::loadVec3BuiltinInputID( // builtin variable. Register Variable = GR.buildGlobalVariable( NewRegister, PtrType, getLinkStringForBuiltIn(BuiltInValue), nullptr, - SPIRV::StorageClass::Input, nullptr, true, false, - SPIRV::LinkageType::Import, MIRBuilder, false); + SPIRV::StorageClass::Input, nullptr, true, std::nullopt, MIRBuilder, + false); // Create new register for loading value. MachineRegisterInfo *MRI = MIRBuilder.getMRI(); @@ -4570,8 +4563,8 @@ bool SPIRVInstructionSelector::loadBuiltinInputID( // builtin variable. Register Variable = GR.buildGlobalVariable( NewRegister, PtrType, getLinkStringForBuiltIn(BuiltInValue), nullptr, - SPIRV::StorageClass::Input, nullptr, true, false, - SPIRV::LinkageType::Import, MIRBuilder, false); + SPIRV::StorageClass::Input, nullptr, true, std::nullopt, MIRBuilder, + false); // Load uint value from the global variable. auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 61a0bbe..f7cdfcb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -547,9 +547,9 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI, if (MI.getOpcode() == SPIRV::OpDecorate) { // If it's got Import linkage. auto Dec = MI.getOperand(1).getImm(); - if (Dec == static_cast<unsigned>(SPIRV::Decoration::LinkageAttributes)) { + if (Dec == SPIRV::Decoration::LinkageAttributes) { auto Lnk = MI.getOperand(MI.getNumOperands() - 1).getImm(); - if (Lnk == static_cast<unsigned>(SPIRV::LinkageType::Import)) { + if (Lnk == SPIRV::LinkageType::Import) { // Map imported function name to function ID register. 
const Function *ImportedFunc = F->getParent()->getFunction(getStringImm(MI, 2)); @@ -635,7 +635,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { InstrTraces IS; for (auto F = M.begin(), E = M.end(); F != E; ++F) { - if ((*F).isDeclaration()) + if (F->isDeclaration()) continue; MachineFunction *MF = MMI->getMachineFunction(*F); assert(MF); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index d8376cd..2d19f6de 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -169,9 +169,7 @@ struct ModuleAnalysisInfo { MCRegister getFuncReg(const Function *F) { assert(F && "Function is null"); - auto FuncPtrRegPair = FuncMap.find(F); - return FuncPtrRegPair == FuncMap.end() ? MCRegister() - : FuncPtrRegPair->second; + return FuncMap.lookup(F); } MCRegister getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; } InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 1d47c89..4e2cc88 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -1040,4 +1040,19 @@ getFirstValidInstructionInsertPoint(MachineBasicBlock &BB) { : VarPos; } +std::optional<SPIRV::LinkageType::LinkageType> +getSpirvLinkageTypeFor(const SPIRVSubtarget &ST, const GlobalValue &GV) { + if (GV.hasLocalLinkage() || GV.hasHiddenVisibility()) + return std::nullopt; + + if (GV.isDeclarationForLinker()) + return SPIRV::LinkageType::Import; + + if (GV.hasLinkOnceODRLinkage() && + ST.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr)) + return SPIRV::LinkageType::LinkOnceODR; + + return SPIRV::LinkageType::Export; +} + } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index 5777a24..99d9d40 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -559,5 +559,8 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI, const MachineInstr *ResType); MachineBasicBlock::iterator getFirstValidInstructionInsertPoint(MachineBasicBlock &BB); + +std::optional<SPIRV::LinkageType::LinkageType> +getSpirvLinkageTypeFor(const SPIRVSubtarget &ST, const GlobalValue &GV); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index ed54404d..7840620 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1583,11 +1583,9 @@ def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$lhs), // MLA: v16i8 -> v4i32 def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs), (v16i8 V128:$rhs))), - (ADD_I32x4 (ADD_I32x4 (DOT (extend_low_s_I16x8 $lhs), - (extend_low_s_I16x8 $rhs)), - (DOT (extend_high_s_I16x8 $lhs), - (extend_high_s_I16x8 $rhs))), - $acc)>; + (ADD_I32x4 (ADD_I32x4 (extadd_pairwise_s_I32x4 (EXTMUL_LOW_S_I16x8 $lhs, $rhs)), + (extadd_pairwise_s_I32x4 (EXTMUL_HIGH_S_I16x8 $lhs, $rhs))), + $acc)>; def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$lhs), (v16i8 V128:$rhs))), (ADD_I32x4 (ADD_I32x4 (extadd_pairwise_u_I32x4 (EXTMUL_LOW_U_I16x8 $lhs, $rhs)), diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp 
b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 100f1ec..53ec712 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -1879,28 +1879,34 @@ bool X86InstructionSelector::selectSelect(MachineInstr &I, unsigned OpCmp; LLT Ty = MRI.getType(DstReg); - switch (Ty.getSizeInBits()) { - default: - return false; - case 8: - OpCmp = X86::CMOV_GR8; - break; - case 16: - OpCmp = STI.canUseCMOV() ? X86::CMOV16rr : X86::CMOV_GR16; - break; - case 32: - OpCmp = STI.canUseCMOV() ? X86::CMOV32rr : X86::CMOV_GR32; - break; - case 64: - assert(STI.is64Bit() && STI.canUseCMOV()); - OpCmp = X86::CMOV64rr; - break; + if (Ty.getSizeInBits() == 80) { + BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(X86::CMOVE_Fp80), + DstReg) + .addReg(Sel.getTrueReg()) + .addReg(Sel.getFalseReg()); + } else { + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMOV_GR8; + break; + case 16: + OpCmp = STI.canUseCMOV() ? X86::CMOV16rr : X86::CMOV_GR16; + break; + case 32: + OpCmp = STI.canUseCMOV() ? X86::CMOV32rr : X86::CMOV_GR32; + break; + case 64: + assert(STI.is64Bit() && STI.canUseCMOV()); + OpCmp = X86::CMOV64rr; + break; + } + BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(OpCmp), DstReg) + .addReg(Sel.getTrueReg()) + .addReg(Sel.getFalseReg()) + .addImm(X86::COND_E); } - BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(OpCmp), DstReg) - .addReg(Sel.getTrueReg()) - .addReg(Sel.getFalseReg()) - .addImm(X86::COND_E); - const TargetRegisterClass *DstRC = getRegClass(Ty, DstReg, MRI); if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain CMOV\n"); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 28fa2cd..e792b1b 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -575,10 +575,13 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, // todo: vectors and address spaces getActionDefinitionsBuilder(G_SELECT) - .legalFor({{s8, s32}, {s16, s32}, {s32, s32}, {s64, s32}, {p0, s32}}) + .legalFor({{s16, s32}, {s32, s32}, {p0, s32}}) + .legalFor(!HasCMOV, {{s8, s32}}) + .legalFor(Is64Bit, {{s64, s32}}) + .legalFor(UseX87, {{s80, s32}}) + .clampScalar(1, s32, s32) .widenScalarToNextPow2(0, /*Min=*/8) - .clampScalar(0, HasCMOV ? s16 : s8, sMaxScalar) - .clampScalar(1, s32, s32); + .clampScalar(0, HasCMOV ? 
s16 : s8, sMaxScalar); // memory intrinsics getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8e08d16..a1fd366 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1164,7 +1164,6 @@ def ProcessorFeatures { FeatureAVXNECONVERT, FeatureAVXVNNIINT8, FeatureAVXVNNIINT16, - FeatureUSERMSR, FeatureSHA512, FeatureSM3, FeatureEGPR, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b05d7c7..b5f8ee5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41846,7 +41846,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) || X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget)) return SDValue(); - Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); + Imm = llvm::rotl<uint8_t>(Imm, 4); return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0, DAG.getTargetConstant(Imm, DL, MVT::i8)); }; diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 08944e6..7882045 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -235,16 +235,16 @@ ARM::NeonSupportLevel ARM::getFPUNeonSupportLevel(ARM::FPUKind FPUKind) { StringRef ARM::getFPUSynonym(StringRef FPU) { return StringSwitch<StringRef>(FPU) - .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported + .Cases({"fpa", "fpe2", "fpe3", "maverick"}, "invalid") // Unsupported .Case("vfp2", "vfpv2") .Case("vfp3", "vfpv3") .Case("vfp4", "vfpv4") .Case("vfp3-d16", "vfpv3-d16") .Case("vfp4-d16", "vfpv4-d16") - .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") - .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") + .Cases({"fp4-sp-d16", "vfpv4-sp-d16"}, "fpv4-sp-d16") + .Cases({"fp4-dp-d16", "fpv4-dp-d16"}, "vfpv4-d16") .Case("fp5-sp-d16", "fpv5-sp-d16") - .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") + .Cases({"fp5-dp-d16", "fpv5-dp-d16"}, "fpv5-d16") // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. .Case("neon-vfpv3", "neon") .Default(FPU); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index dd13ce3..b13c795 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -143,8 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids = FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS | - FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | - FeatureAMX_TRANSPOSE | FeatureUSERMSR; + FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE; // Intel Atom processors. // Bonnell has feature parity with Core2 and adds MOVBE. 
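The combineCommutableSHUFP change above swaps the two nibbles of the 8-bit SHUFP immediate; llvm::rotl<uint8_t>(Imm, 4) and the old mask-and-shift expression are equivalent for 8-bit values. A small standalone check of that equivalence (plain C++ here rather than the LLVM helper):

// Sketch only: the old and new immediate rewrites agree for 8-bit values;
// rotating by 4 is exactly a nibble swap.
#include <cstdint>

static constexpr uint8_t swapNibbles(uint8_t Imm) {
  return static_cast<uint8_t>(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
}

static constexpr uint8_t rotl8by4(uint8_t Imm) {
  return static_cast<uint8_t>((Imm << 4) | (Imm >> 4)); // llvm::rotl<uint8_t>(Imm, 4)
}

static_assert(swapNibbles(0xB4) == 0x4B && rotl8by4(0xB4) == 0x4B,
              "rotating an 8-bit immediate by 4 swaps its nibbles");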
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index e1e24a9..dab200d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -289,12 +289,11 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { // * Narrow width by halfs excluding zero/undef lanes Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) { Value *LoadPtr = II.getArgOperand(0); - const Align Alignment = - cast<ConstantInt>(II.getArgOperand(1))->getAlignValue(); + const Align Alignment = II.getParamAlign(0).valueOrOne(); // If the mask is all ones or undefs, this is a plain vector load of the 1st // argument. - if (maskIsAllOneOrUndef(II.getArgOperand(2))) { + if (maskIsAllOneOrUndef(II.getArgOperand(1))) { LoadInst *L = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, "unmaskedload"); L->copyMetadata(II); @@ -308,7 +307,7 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) { LoadInst *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, "unmaskedload"); LI->copyMetadata(II); - return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); + return Builder.CreateSelect(II.getArgOperand(1), LI, II.getArgOperand(2)); } return nullptr; @@ -319,8 +318,8 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) { // * Narrow width by halfs excluding zero/undef lanes Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { Value *StorePtr = II.getArgOperand(1); - Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); - auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); + Align Alignment = II.getParamAlign(1).valueOrOne(); + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); if (!ConstMask) return nullptr; @@ -356,7 +355,7 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { // * Narrow width by halfs excluding zero/undef lanes // * Vector incrementing address -> vector masked load Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { - auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(1)); if (!ConstMask) return nullptr; @@ -366,8 +365,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { if (ConstMask->isAllOnesValue()) if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) { auto *VecTy = cast<VectorType>(II.getType()); - const Align Alignment = - cast<ConstantInt>(II.getArgOperand(1))->getAlignValue(); + const Align Alignment = II.getParamAlign(0).valueOrOne(); LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr, Alignment, "load.scalar"); Value *Shuf = @@ -384,7 +382,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { // * Narrow store width by halfs excluding zero/undef lanes // * Vector incrementing address -> vector masked store Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { - auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); if (!ConstMask) return nullptr; @@ -397,8 +395,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { if (maskContainsAllOneOrUndef(ConstMask)) { - Align Alignment = - 
cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + Align Alignment = II.getParamAlign(1).valueOrOne(); StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment); S->copyMetadata(II); @@ -408,7 +405,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { // scatter(vector, splat(ptr), splat(true)) -> store extract(vector, // lastlane), ptr if (ConstMask->isAllOnesValue()) { - Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + Align Alignment = II.getParamAlign(1).valueOrOne(); VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType()); ElementCount VF = WideLoadTy->getElementCount(); Value *RunTimeVF = Builder.CreateElementCount(Builder.getInt32Ty(), VF); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index a8eb9b9..975498f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4501,24 +4501,24 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Value *V = foldSelectIntoAddConstant(SI, Builder)) return replaceInstUsesWith(SI, V); - // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0) + // select(mask, mload(ptr,mask,0), 0) -> mload(ptr,mask,0) // Load inst is intentionally not checked for hasOneUse() if (match(FalseVal, m_Zero()) && - (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal), + (match(TrueVal, m_MaskedLoad(m_Value(), m_Specific(CondVal), m_CombineOr(m_Undef(), m_Zero()))) || - match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal), + match(TrueVal, m_MaskedGather(m_Value(), m_Specific(CondVal), m_CombineOr(m_Undef(), m_Zero()))))) { auto *MaskedInst = cast<IntrinsicInst>(TrueVal); - if (isa<UndefValue>(MaskedInst->getArgOperand(3))) - MaskedInst->setArgOperand(3, FalseVal /* Zero */); + if (isa<UndefValue>(MaskedInst->getArgOperand(2))) + MaskedInst->setArgOperand(2, FalseVal /* Zero */); return replaceInstUsesWith(SI, MaskedInst); } Value *Mask; if (match(TrueVal, m_Zero()) && - (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask), + (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(Mask), m_CombineOr(m_Undef(), m_Zero()))) || - match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask), + match(FalseVal, m_MaskedGather(m_Value(), m_Value(Mask), m_CombineOr(m_Undef(), m_Zero())))) && (CondVal->getType() == Mask->getType())) { // We can remove the select by ensuring the load zeros all lanes the @@ -4531,8 +4531,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (CanMergeSelectIntoLoad) { auto *MaskedInst = cast<IntrinsicInst>(FalseVal); - if (isa<UndefValue>(MaskedInst->getArgOperand(3))) - MaskedInst->setArgOperand(3, TrueVal /* Zero */); + if (isa<UndefValue>(MaskedInst->getArgOperand(2))) + MaskedInst->setArgOperand(2, TrueVal /* Zero */); return replaceInstUsesWith(SI, MaskedInst); } } @@ -4671,14 +4671,13 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { } Value *MaskedLoadPtr; - const APInt *MaskedLoadAlignment; if (match(TrueVal, m_OneUse(m_MaskedLoad(m_Value(MaskedLoadPtr), - m_APInt(MaskedLoadAlignment), m_Specific(CondVal), m_Value())))) return replaceInstUsesWith( - SI, Builder.CreateMaskedLoad(TrueVal->getType(), MaskedLoadPtr, - Align(MaskedLoadAlignment->getZExtValue()), - CondVal, FalseVal)); + SI, Builder.CreateMaskedLoad( + TrueVal->getType(), MaskedLoadPtr, + 
cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(), + CondVal, FalseVal)); return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index a330bb7..651e305 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1892,7 +1892,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, // segfaults which didn't exist in the original program. APInt DemandedPtrs(APInt::getAllOnes(VWidth)), DemandedPassThrough(DemandedElts); - if (auto *CMask = dyn_cast<Constant>(II->getOperand(2))) { + if (auto *CMask = dyn_cast<Constant>(II->getOperand(1))) { for (unsigned i = 0; i < VWidth; i++) { if (Constant *CElt = CMask->getAggregateElement(i)) { if (CElt->isNullValue()) @@ -1905,7 +1905,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, if (II->getIntrinsicID() == Intrinsic::masked_gather) simplifyAndSetOp(II, 0, DemandedPtrs, PoisonElts2); - simplifyAndSetOp(II, 3, DemandedPassThrough, PoisonElts3); + simplifyAndSetOp(II, 2, DemandedPassThrough, PoisonElts3); // Output elements are undefined if the element from both sources are. // TODO: can strengthen via mask as well. diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 2646334..cb6ca72 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1494,11 +1494,8 @@ void AddressSanitizer::getInterestingMemoryOperands( if (ignoreAccess(I, BasePtr)) return; Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType(); - MaybeAlign Alignment = Align(1); - // Otherwise no alignment guarantees. We probably got Undef. 
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) - Alignment = Op->getMaybeAlignValue(); - Value *Mask = CI->getOperand(2 + OpOffset); + MaybeAlign Alignment = CI->getParamAlign(0); + Value *Mask = CI->getOperand(1 + OpOffset); Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask); break; } diff --git a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp index 3ae771a..3c0f185 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp @@ -338,7 +338,7 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { } auto *BasePtr = CI->getOperand(0 + OpOffset); - Access.MaybeMask = CI->getOperand(2 + OpOffset); + Access.MaybeMask = CI->getOperand(1 + OpOffset); Access.Addr = BasePtr; } } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index eff6f0c..b6cbecb 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4191,10 +4191,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleMaskedGather(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Ptrs = I.getArgOperand(0); - const Align Alignment( - cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); - Value *Mask = I.getArgOperand(2); - Value *PassThru = I.getArgOperand(3); + const Align Alignment = I.getParamAlign(0).valueOrOne(); + Value *Mask = I.getArgOperand(1); + Value *PassThru = I.getArgOperand(2); Type *PtrsShadowTy = getShadowTy(Ptrs); if (ClCheckAccessAddress) { @@ -4230,9 +4229,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Values = I.getArgOperand(0); Value *Ptrs = I.getArgOperand(1); - const Align Alignment( - cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()); - Value *Mask = I.getArgOperand(3); + const Align Alignment = I.getParamAlign(1).valueOrOne(); + Value *Mask = I.getArgOperand(2); Type *PtrsShadowTy = getShadowTy(Ptrs); if (ClCheckAccessAddress) { @@ -4262,9 +4260,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); Value *Ptr = I.getArgOperand(1); - const Align Alignment( - cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()); - Value *Mask = I.getArgOperand(3); + const Align Alignment = I.getParamAlign(1).valueOrOne(); + Value *Mask = I.getArgOperand(2); Value *Shadow = getShadow(V); if (ClCheckAccessAddress) { @@ -4295,10 +4292,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Ptr = I.getArgOperand(0); - const Align Alignment( - cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); - Value *Mask = I.getArgOperand(2); - Value *PassThru = I.getArgOperand(3); + const Align Alignment = I.getParamAlign(0).valueOrOne(); + Value *Mask = I.getArgOperand(1); + Value *PassThru = I.getArgOperand(2); if (ClCheckAccessAddress) { insertCheckShadowOf(Ptr, &I); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 6141b6d..4ac1321 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -272,7 +272,7 @@ static OverwriteResult isMaskedStoreOverwrite(const 
Instruction *KillingI, if (KillingII->getIntrinsicID() == Intrinsic::masked_store) { // Masks. // TODO: check that KillingII's mask is a superset of the DeadII's mask. - if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3)) + if (KillingII->getArgOperand(2) != DeadII->getArgOperand(2)) return OW_Unknown; } else if (KillingII->getIntrinsicID() == Intrinsic::vp_store) { // Masks. diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 2afa7b7..e30f306 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -1017,14 +1017,14 @@ private: }; auto MaskOp = [](const IntrinsicInst *II) { if (II->getIntrinsicID() == Intrinsic::masked_load) - return II->getOperand(2); + return II->getOperand(1); if (II->getIntrinsicID() == Intrinsic::masked_store) - return II->getOperand(3); + return II->getOperand(2); llvm_unreachable("Unexpected IntrinsicInst"); }; auto ThruOp = [](const IntrinsicInst *II) { if (II->getIntrinsicID() == Intrinsic::masked_load) - return II->getOperand(3); + return II->getOperand(2); llvm_unreachable("Unexpected IntrinsicInst"); }; diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 42db424..72e1131 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2212,11 +2212,11 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) { if (!DepInst || !Dep.isLocal() || !Dep.isDef()) return false; - Value *Mask = I->getOperand(2); - Value *Passthrough = I->getOperand(3); + Value *Mask = I->getOperand(1); + Value *Passthrough = I->getOperand(2); Value *StoreVal; - if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(), - m_Specific(Mask))) || + if (!match(DepInst, + m_MaskedStore(m_Value(StoreVal), m_Value(), m_Specific(Mask))) || StoreVal->getType() != I->getType()) return false; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index b9534def..a06f832 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -430,6 +430,7 @@ public: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::PtrToInt: + case Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::BitCast: case Instruction::AddrSpaceCast: diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 995b803..39751c0 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -45,25 +45,20 @@ static bool tryToImproveAlign( switch (II->getIntrinsicID()) { case Intrinsic::masked_load: case Intrinsic::masked_store: { - int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2; - Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load - ? II->getArgOperand(0) - : II->getArgOperand(1); + unsigned PtrOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; + Value *PtrOp = II->getArgOperand(PtrOpIdx); Type *Type = II->getIntrinsicID() == Intrinsic::masked_load ? 
II->getType() : II->getArgOperand(0)->getType(); - Align OldAlign = - cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue(); + Align OldAlign = II->getParamAlign(PtrOpIdx).valueOrOne(); Align PrefAlign = DL.getPrefTypeAlign(Type); Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); - if (NewAlign <= OldAlign || - NewAlign.value() > std::numeric_limits<uint32_t>().max()) + if (NewAlign <= OldAlign) return false; - Value *V = - ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value()); - II->setOperand(AlignOpIdx, V); + II->addParamAttr(PtrOpIdx, + Attribute::getWithAlignment(II->getContext(), NewAlign)); return true; } default: diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 28ae4f0..9aaf6a5 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -43,6 +43,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <utility> @@ -1872,6 +1873,51 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch); } +/// This deals with a corner case when an LCSSA phi node appears in a non-exit +/// block: the outer loop latch block does not need to be the exit block of the +/// inner loop. Consider a loop that was in LCSSA form, but then some +/// transformation like loop-unswitch comes along and creates an empty block, +/// where BB5 in this example is the outer loop latch block: +/// +/// BB4: +/// br label %BB5 +/// BB5: +/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ] +/// br outer.header +/// +/// Interchange then brings it into LCSSA form again, resulting in this chain of +/// single-input phi nodes: +/// +/// BB4: +/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ] +/// br label %BB5 +/// BB5: +/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ] +/// +/// The problem is that interchange can reorder blocks BB4 and BB5, placing the +/// use before the def if we don't check this. The solution is to simplify +/// (i.e. remove) LCSSA phi nodes if they appear in non-exit blocks. +/// +static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + // Do not modify lcssa phis where they actually belong, i.e. in exit blocks. + if (OuterLoopLatch == InnerLoopExit) + return; + + // Collect and remove phis in non-exit blocks if they have 1 input.
+ SmallVector<PHINode *, 8> Phis( + llvm::make_pointer_range(OuterLoopLatch->phis())); + for (PHINode *Phi : Phis) { + assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected"); + LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi + << "\n"); + Phi->replaceAllUsesWith(Phi->getIncomingValue(0)); + Phi->eraseFromParent(); + } +} + bool LoopInterchangeTransform::adjustLoopBranches() { LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); std::vector<DominatorTree::UpdateType> DTUpdates; @@ -1882,6 +1928,9 @@ bool LoopInterchangeTransform::adjustLoopBranches() { assert(OuterLoopPreHeader != OuterLoop->getHeader() && InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && InnerLoopPreHeader && "Guaranteed by loop-simplify form"); + + simplifyLCSSAPhis(OuterLoop, InnerLoop); + // Ensure that both preheaders do not contain PHI nodes and have single // predecessors. This allows us to move them easily. We use // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 42d6680..146e7d1 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -111,7 +111,7 @@ static unsigned adjustForEndian(const DataLayout &DL, unsigned VectorWidth, } // Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, +// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, // <16 x i1> %mask, <16 x i32> %passthru) // to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set @@ -146,11 +146,10 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Ptr = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); + Value *Mask = CI->getArgOperand(1); + Value *Src0 = CI->getArgOperand(2); - const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue(); + const Align AlignVal = CI->getParamAlign(0).valueOrOne(); VectorType *VecType = cast<FixedVectorType>(CI->getType()); Type *EltTy = VecType->getElementType(); @@ -290,7 +289,7 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence, } // Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, +// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, // <16 x i1> %mask) // to a chain of basic blocks, that stores element one-by-one if // the appropriate mask bit is set @@ -320,10 +319,9 @@ static void scalarizeMaskedStore(const DataLayout &DL, bool HasBranchDivergence, bool &ModifiedDT) { Value *Src = CI->getArgOperand(0); Value *Ptr = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); + Value *Mask = CI->getArgOperand(2); - const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue(); + const Align AlignVal = CI->getParamAlign(1).valueOrOne(); auto *VecType = cast<VectorType>(Src->getType()); Type *EltTy = VecType->getElementType(); @@ -472,9 +470,8 @@ static void scalarizeMaskedGather(const DataLayout &DL, bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Ptrs = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - 
Value *Src0 = CI->getArgOperand(3); + Value *Mask = CI->getArgOperand(1); + Value *Src0 = CI->getArgOperand(2); auto *VecType = cast<FixedVectorType>(CI->getType()); Type *EltTy = VecType->getElementType(); @@ -483,7 +480,7 @@ static void scalarizeMaskedGather(const DataLayout &DL, Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); - MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue(); + Align AlignVal = CI->getParamAlign(0).valueOrOne(); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); @@ -608,8 +605,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Src = CI->getArgOperand(0); Value *Ptrs = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); + Value *Mask = CI->getArgOperand(2); auto *SrcFVTy = cast<FixedVectorType>(Src->getType()); @@ -623,7 +619,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL, Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue(); + Align AlignVal = CI->getParamAlign(1).valueOrOne(); unsigned VectorWidth = SrcFVTy->getNumElements(); // Shorten the way if the mask is a vector of constants. @@ -1125,8 +1121,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, case Intrinsic::masked_load: // Scalarize unsupported vector masked load if (TTI.isLegalMaskedLoad( - CI->getType(), - cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(), + CI->getType(), CI->getParamAlign(0).valueOrOne(), cast<PointerType>(CI->getArgOperand(0)->getType()) ->getAddressSpace())) return false; @@ -1135,18 +1130,15 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, case Intrinsic::masked_store: if (TTI.isLegalMaskedStore( CI->getArgOperand(0)->getType(), - cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(), + CI->getParamAlign(1).valueOrOne(), cast<PointerType>(CI->getArgOperand(1)->getType()) ->getAddressSpace())) return false; scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_gather: { - MaybeAlign MA = - cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue(); + Align Alignment = CI->getParamAlign(0).valueOrOne(); Type *LoadTy = CI->getType(); - Align Alignment = DL.getValueOrABITypeAlignment(MA, - LoadTy->getScalarType()); if (TTI.isLegalMaskedGather(LoadTy, Alignment) && !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment)) return false; @@ -1154,11 +1146,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, return true; } case Intrinsic::masked_scatter: { - MaybeAlign MA = - cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue(); + Align Alignment = CI->getParamAlign(1).valueOrOne(); Type *StoreTy = CI->getArgOperand(0)->getType(); - Align Alignment = DL.getValueOrABITypeAlignment(MA, - StoreTy->getScalarType()); if (TTI.isLegalMaskedScatter(StoreTy, Alignment) && !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy), Alignment)) diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index fa66a03..23e1243 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,6 +227,7 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I, case Instruction::Call: case Instruction::BitCast: case Instruction::PtrToInt: + case 
Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::AddrSpaceCast: case Instruction::FPToUI: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 7651ba1..3fed003 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -325,6 +325,8 @@ public: VPIRFlags Flags; if (Opcode == Instruction::Trunc) Flags = VPIRFlags::TruncFlagsTy(false, false); + else if (Opcode == Instruction::ZExt) + Flags = VPIRFlags::NonNegFlagsTy(false); return tryInsertInstruction( new VPWidenCastRecipe(Opcode, Op, ResultTy, Flags)); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 280eb20..febdc54 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); - VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks, + BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9cd52da..3f18bd7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5343,7 +5343,7 @@ private: unsigned &OpCnt = OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); EdgeInfo EI(TE, U.getOperandNo()); - if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + if (!getScheduleCopyableData(EI, Op)) continue; // Found copyable operand - continue. ++OpCnt; @@ -10546,8 +10546,11 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, PoisonValue::get(UniqueValues.front()->getType())); // Check that extended with poisons/copyable operations are still valid // for vectorization (div/rem are not allowed). - if (!S.areInstructionsWithCopyableElements() && - !getSameOpcode(PaddedUniqueValues, TLI).valid()) { + if ((!S.areInstructionsWithCopyableElements() && + !getSameOpcode(PaddedUniqueValues, TLI).valid()) || + (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() && + (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() || + isa<CallInst>(S.getMainOp())))) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); ReuseShuffleIndices.clear(); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 84d2ea6..fed04eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1007,6 +1007,11 @@ public: /// Creates a fixed-width vector containing all operands. The number of /// operands matches the vector element count. BuildVector, + /// Extracts all lanes from its (non-scalable) vector operand. This is an + /// abstract VPInstruction whose single defined VPValue represents VF + /// scalars extracted from a vector, to be replaced by VF ExtractElement + /// VPInstructions. 
+ Unpack, /// Compute the final result of a AnyOf reduction with select(cmp(),x,y), /// where one of (x,y) is loop invariant, and both x and y are integer type. ComputeAnyOfResult, @@ -2715,6 +2720,15 @@ public: return R && classof(R); } + static inline bool classof(const VPValue *VPV) { + const VPRecipeBase *R = VPV->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPSingleDefRecipe *R) { + return classof(static_cast<const VPRecipeBase *>(R)); + } + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; @@ -3100,6 +3114,9 @@ public: /// Returns true if this expression contains recipes that may have side /// effects. bool mayHaveSideEffects() const; + + /// Returns true if the result of this VPExpressionRecipe is a single-scalar. + bool isSingleScalar() const; }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 7e074c1..80a2e4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::AnyOf: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: + case VPInstruction::Unpack: return SetResultTyFromOp(); case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index d8203e2..b5b98c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } +template <typename Op0_t, typename Op1_t> +inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t> +m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> m_ExtractLastLanePerPart(const Op0_t &Op0) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d1e67e6b..1f1b42b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: case VPInstruction::Not: + case VPInstruction::Unpack: return 1; case Instruction::ICmp: case Instruction::FCmp: @@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: case VPInstruction::VScale: + case VPInstruction::Unpack: return false; default: return true; @@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::PtrAdd: return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); case VPInstruction::WidePtrAdd: - return Op == getOperand(0); + // WidePtrAdd supports scalar and vector base addresses. 
+ return false; case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); @@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ResumeForEpilogue: O << "resume-for-epilogue"; break; + case VPInstruction::Unpack: + O << "unpack"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const { return false; } +bool VPExpressionRecipe::isSingleScalar() const { + // Cannot use vputils::isSingleScalar(), because all external operands + // of the expression will be live-ins while bundled. + return isa<VPReductionRecipe>(ExpressionRecipes.back()) && + !isa<VPPartialReductionRecipe>(ExpressionRecipes.back()); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, @@ -3149,7 +3162,17 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) { while (!WorkList.empty()) { auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val()); - if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur)) + if (!Cur || !Seen.insert(Cur).second) + continue; + + auto *Blend = dyn_cast<VPBlendRecipe>(Cur); + // Skip blends that use V only through a compare by checking if any incoming + // value was already visited. + if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()), + [&](unsigned I) { + return Seen.contains( + Blend->getIncomingValue(I)->getDefiningRecipe()); + })) continue; for (VPUser *U : Cur->users()) { @@ -3170,7 +3193,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) { } } - append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users()); + // The legacy cost model only supports scalarization loads/stores with phi + // addresses, if the phi is directly used as load/store address. Don't + // traverse further for Blends. + if (Blend) + continue; + + append_range(WorkList, Cur->users()); } return false; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5f616f..e060e70 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -106,7 +106,7 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return false; NewRecipe = new VPWidenIntrinsicRecipe( *CI, getVectorIntrinsicIDForCall(CI, &TLI), - {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(), + drop_end(Ingredient.operands()), CI->getType(), CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) { NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); @@ -356,8 +356,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, // Replace predicated replicate recipe with a replicate recipe without a // mask but in the replicate region. 
auto *RecipeWithoutMask = new VPReplicateRecipe( - PredRecipe->getUnderlyingInstr(), - make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), + PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()), PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe); auto *Pred = Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); @@ -939,7 +938,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { continue; if (!isDeadRecipe(*R)) continue; - WorkList.append(R->op_begin(), R->op_end()); + append_range(WorkList, R->operands()); R->eraseFromParent(); } } @@ -1224,6 +1223,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + uint64_t Idx; + if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -2006,7 +2012,7 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { .Case<VPWidenIntrinsicRecipe>([](auto *I) { return std::make_pair(true, I->getVectorIntrinsicID()); }) - .Case<VPVectorPointerRecipe>([](auto *I) { + .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { // For recipes that do not directly map to LLVM IR instructions, // assign opcodes after the last VPInstruction opcode (which is also // after the last IR Instruction opcode), based on the VPDefID. @@ -2083,6 +2089,15 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { LFlags->getPredicate() != cast<VPRecipeWithIRFlags>(R)->getPredicate()) return false; + // Recipes in replicate regions implicitly depend on predicate. If either + // recipe is in a replicate region, only consider them equal if both have + // the same parent. + const VPRegionBlock *RegionL = L->getParent()->getParent(); + const VPRegionBlock *RegionR = R->getParent()->getParent(); + if (((RegionL && RegionL->isReplicator()) || + (RegionR && RegionR->isReplicator())) && + L->getParent() != R->getParent()) + return false; const VPlan *Plan = L->getParent()->getPlan(); VPTypeAnalysis TypeInfo(*Plan); return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R); @@ -3780,7 +3795,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, BTC->replaceAllUsesWith(TCMO); } -void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { +void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) { if (Plan.hasScalarVFOnly()) return; @@ -3828,6 +3843,50 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { }); } } + + // Create explicit VPInstructions to convert vectors to scalars. The current + // implementation is conservative - it may miss some cases that may or may not + // be vector values. TODO: introduce Unpacks speculatively - remove them later + // if they are known to operate on scalar values. + for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe, + VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R)) + continue; + for (VPValue *Def : R.definedValues()) { + // Skip recipes that are single-scalar or only have their first lane + // used. + // TODO: The Defs skipped here may or may not be vector values. + // Introduce Unpacks, and remove them later, if they are guaranteed to + // produce scalar values. 
+ if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def)) + continue; + + // At the moment, we create unpacks only for scalar users outside + // replicate regions. Recipes inside replicate regions still extract the + // required lanes implicitly. + // TODO: Remove once replicate regions are unrolled completely. + auto IsCandidateUnpackUser = [Def](VPUser *U) { + VPRegionBlock *ParentRegion = + cast<VPRecipeBase>(U)->getParent()->getParent(); + return U->usesScalars(Def) && + (!ParentRegion || !ParentRegion->isReplicator()); + }; + if (none_of(Def->users(), IsCandidateUnpackUser)) + continue; + + auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def}); + if (R.isPhi()) + Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi()); + else + Unpack->insertAfter(&R); + Def->replaceUsesWithIf(Unpack, + [&IsCandidateUnpackUser](VPUser &U, unsigned) { + return IsCandidateUnpackUser(&U); + }); + } + } + } } void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 5a8a2bb..b28559b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -325,9 +325,10 @@ struct VPlanTransforms { static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH); - /// Add explicit Build[Struct]Vector recipes that combine multiple scalar - /// values into single vectors. - static void materializeBuildVectors(VPlan &Plan); + /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values + /// into vectors and Unpack recipes to extract scalars from vectors as + /// needed. + static void materializePacksAndUnpacks(VPlan &Plan); /// Materialize VF and VFxUF to be computed explicitly using VPInstructions. static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 5aeda3e..cfd1a74 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or /// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar /// definitions for operands of \DefR. -static VPRecipeWithIRFlags * +static VPValue * cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, VPRecipeWithIRFlags *DefR, VPLane Lane, const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { + VPValue *Op; + if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) { + auto LaneDefs = Def2LaneDefs.find(Op); + if (LaneDefs != Def2LaneDefs.end()) + return LaneDefs->second[Lane.getKnownLane()]; + + VPValue *Idx = + Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); + } + // Collect the operands at Lane, creating extracts as needed. SmallVector<VPValue *> NewOps; for (VPValue *Op : DefR->operands()) { @@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, continue; } if (Lane.getKind() == VPLane::Kind::ScalableLast) { + // Look through mandatory Unpack. 
+ [[maybe_unused]] bool Matched = + match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op))); + assert(Matched && "original op must have been Unpack"); NewOps.push_back( Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; @@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { (isa<VPReplicateRecipe>(&R) && cast<VPReplicateRecipe>(&R)->isSingleScalar()) || (isa<VPInstruction>(&R) && - !cast<VPInstruction>(&R)->doesGeneratePerAllLanes())) + !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() && + cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack)) continue; auto *DefR = cast<VPRecipeWithIRFlags>(&R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 8b1b0e5..10801c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -113,12 +113,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { return TypeSwitch<const VPRecipeBase *, bool>(R) .Case<VPDerivedIVRecipe>([](const auto *R) { return true; }) .Case<VPReplicateRecipe>([](const auto *R) { - // Loads and stores that are uniform across VF lanes are handled by - // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if - // all their operands are invariant. - // TODO: Further relax the restrictions. + // Be conservative about side-effects, except for the + // known-side-effecting assumes and stores, which we know will be + // uniform. return R->isSingleScalar() && - (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) && + (!R->mayHaveSideEffects() || + isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) && all_of(R->operands(), isUniformAcrossVFsAndUFs); }) .Case<VPInstruction>([](const auto *VPI) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9a2497e..840a5b9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) { return VPI->isSingleScalar() || VPI->isVectorToScalar() || (PreservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isSingleScalar)); + if (isa<VPPartialReductionRecipe>(VPV)) + return false; + if (isa<VPReductionRecipe>(VPV)) + return true; + if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV)) + return Expr->isSingleScalar(); // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. return isa<VPExpandSCEVRecipe>(VPV); diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp index 1fc8974..09468da 100644 --- a/llvm/lib/WindowsDriver/MSVCPaths.cpp +++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp @@ -259,9 +259,7 @@ static bool getSystemRegistryString(const char *keyPath, const char *valueName, #endif // _WIN32 } -namespace llvm { - -const char *archToWindowsSDKArch(Triple::ArchType Arch) { +const char *llvm::archToWindowsSDKArch(Triple::ArchType Arch) { switch (Arch) { case Triple::ArchType::x86: return "x86"; @@ -277,7 +275,7 @@ const char *archToWindowsSDKArch(Triple::ArchType Arch) { } } -const char *archToLegacyVCArch(Triple::ArchType Arch) { +const char *llvm::archToLegacyVCArch(Triple::ArchType Arch) { switch (Arch) { case Triple::ArchType::x86: // x86 is default in legacy VC toolchains. 
@@ -295,7 +293,7 @@ const char *archToLegacyVCArch(Triple::ArchType Arch) { } } -const char *archToDevDivInternalArch(Triple::ArchType Arch) { +const char *llvm::archToDevDivInternalArch(Triple::ArchType Arch) { switch (Arch) { case Triple::ArchType::x86: return "i386"; @@ -311,8 +309,9 @@ const char *archToDevDivInternalArch(Triple::ArchType Arch) { } } -bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath, - Triple::ArchType Arch, std::string &path) { +bool llvm::appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath, + Triple::ArchType Arch, + std::string &path) { if (SDKMajor >= 8) { sys::path::append(LibPath, archToWindowsSDKArch(Arch)); } else { @@ -336,10 +335,11 @@ bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath, return true; } -std::string getSubDirectoryPath(SubDirectoryType Type, ToolsetLayout VSLayout, - const std::string &VCToolChainPath, - Triple::ArchType TargetArch, - StringRef SubdirParent) { +std::string llvm::getSubDirectoryPath(SubDirectoryType Type, + ToolsetLayout VSLayout, + const std::string &VCToolChainPath, + Triple::ArchType TargetArch, + StringRef SubdirParent) { const char *SubdirName; const char *IncludeName; switch (VSLayout) { @@ -390,19 +390,22 @@ std::string getSubDirectoryPath(SubDirectoryType Type, ToolsetLayout VSLayout, return std::string(Path); } -bool useUniversalCRT(ToolsetLayout VSLayout, const std::string &VCToolChainPath, - Triple::ArchType TargetArch, vfs::FileSystem &VFS) { +bool llvm::useUniversalCRT(ToolsetLayout VSLayout, + const std::string &VCToolChainPath, + Triple::ArchType TargetArch, vfs::FileSystem &VFS) { SmallString<128> TestPath(getSubDirectoryPath( SubDirectoryType::Include, VSLayout, VCToolChainPath, TargetArch)); sys::path::append(TestPath, "stdlib.h"); return !VFS.exists(TestPath); } -bool getWindowsSDKDir(vfs::FileSystem &VFS, std::optional<StringRef> WinSdkDir, - std::optional<StringRef> WinSdkVersion, - std::optional<StringRef> WinSysRoot, std::string &Path, - int &Major, std::string &WindowsSDKIncludeVersion, - std::string &WindowsSDKLibVersion) { +bool llvm::getWindowsSDKDir(vfs::FileSystem &VFS, + std::optional<StringRef> WinSdkDir, + std::optional<StringRef> WinSdkVersion, + std::optional<StringRef> WinSysRoot, + std::string &Path, int &Major, + std::string &WindowsSDKIncludeVersion, + std::string &WindowsSDKLibVersion) { // Trust /winsdkdir and /winsdkversion if present. if (getWindowsSDKDirViaCommandLine(VFS, WinSdkDir, WinSdkVersion, WinSysRoot, Path, Major, WindowsSDKIncludeVersion)) { @@ -460,11 +463,11 @@ bool getWindowsSDKDir(vfs::FileSystem &VFS, std::optional<StringRef> WinSdkDir, return false; } -bool getUniversalCRTSdkDir(vfs::FileSystem &VFS, - std::optional<StringRef> WinSdkDir, - std::optional<StringRef> WinSdkVersion, - std::optional<StringRef> WinSysRoot, - std::string &Path, std::string &UCRTVersion) { +bool llvm::getUniversalCRTSdkDir(vfs::FileSystem &VFS, + std::optional<StringRef> WinSdkDir, + std::optional<StringRef> WinSdkVersion, + std::optional<StringRef> WinSysRoot, + std::string &Path, std::string &UCRTVersion) { // If /winsdkdir is passed, use it as location for the UCRT too. // FIXME: Should there be a dedicated /ucrtdir to override /winsdkdir? 
int Major; @@ -491,11 +494,11 @@ bool getUniversalCRTSdkDir(vfs::FileSystem &VFS, return getWindows10SDKVersionFromPath(VFS, Path, UCRTVersion); } -bool findVCToolChainViaCommandLine(vfs::FileSystem &VFS, - std::optional<StringRef> VCToolsDir, - std::optional<StringRef> VCToolsVersion, - std::optional<StringRef> WinSysRoot, - std::string &Path, ToolsetLayout &VSLayout) { +bool llvm::findVCToolChainViaCommandLine( + vfs::FileSystem &VFS, std::optional<StringRef> VCToolsDir, + std::optional<StringRef> VCToolsVersion, + std::optional<StringRef> WinSysRoot, std::string &Path, + ToolsetLayout &VSLayout) { // Don't validate the input; trust the value supplied by the user. // The primary motivation is to prevent unnecessary file and registry access. if (VCToolsDir || WinSysRoot) { @@ -518,8 +521,9 @@ bool findVCToolChainViaCommandLine(vfs::FileSystem &VFS, return false; } -bool findVCToolChainViaEnvironment(vfs::FileSystem &VFS, std::string &Path, - ToolsetLayout &VSLayout) { +bool llvm::findVCToolChainViaEnvironment(vfs::FileSystem &VFS, + std::string &Path, + ToolsetLayout &VSLayout) { // These variables are typically set by vcvarsall.bat // when launching a developer command prompt. if (std::optional<std::string> VCToolsInstallDir = @@ -627,9 +631,9 @@ bool findVCToolChainViaEnvironment(vfs::FileSystem &VFS, std::string &Path, return false; } -bool findVCToolChainViaSetupConfig(vfs::FileSystem &VFS, - std::optional<StringRef> VCToolsVersion, - std::string &Path, ToolsetLayout &VSLayout) { +bool llvm::findVCToolChainViaSetupConfig( + vfs::FileSystem &VFS, std::optional<StringRef> VCToolsVersion, + std::string &Path, ToolsetLayout &VSLayout) { #if !defined(USE_MSVC_SETUP_API) return false; #else @@ -724,7 +728,8 @@ bool findVCToolChainViaSetupConfig(vfs::FileSystem &VFS, #endif } -bool findVCToolChainViaRegistry(std::string &Path, ToolsetLayout &VSLayout) { +bool llvm::findVCToolChainViaRegistry(std::string &Path, + ToolsetLayout &VSLayout) { std::string VSInstallPath; if (getSystemRegistryString(R"(SOFTWARE\Microsoft\VisualStudio\$VERSION)", "InstallDir", VSInstallPath, nullptr) || @@ -744,5 +749,3 @@ bool findVCToolChainViaRegistry(std::string &Path, ToolsetLayout &VSLayout) { } return false; } - -} // namespace llvm
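A recurring change in the masked-intrinsic hunks above is that the alignment is no longer passed as an explicit i32 operand; it is read from the pointer argument's parameter attribute via getParamAlign, with align 1 as the fallback. Below is a minimal sketch of querying it under the new operand layout, mirroring the calls used in this patch; the helper names are illustrative and not part of the patch.

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// masked.load / masked.gather: pointer(s) at argument 0, mask at 1, passthru at 2.
static Align getMaskedLoadAlignment(const CallInst &CI) {
  // The align attribute sits on the pointer argument; valueOrOne() falls
  // back to align 1 when the attribute is absent.
  return CI.getParamAlign(0).valueOrOne();
}

// masked.store / masked.scatter: data at argument 0, pointer(s) at 1, mask at 2.
static Align getMaskedStoreAlignment(const CallInst &CI) {
  return CI.getParamAlign(1).valueOrOne();
}

At the IR level, a call would then presumably be written as @llvm.masked.load(ptr align 16 %p, <4 x i1> %m, <4 x i32> %passthru) instead of carrying a separate i32 alignment operand, matching the updated intrinsic comments in ScalarizeMaskedMemIntrin.cpp above.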