diff options
Diffstat (limited to 'llvm/lib')
25 files changed, 605 insertions, 273 deletions
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076..523374b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case 
Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). - return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb0..8930d64 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo &LTOInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ 
getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. @@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c..76494c7 100644 --- 
a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef0..7496c5a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8..cd39970 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b775cbb..95d61a9 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -148,18 +148,10 @@ void Value::destroyValueName() { } bool 
Value::hasNUses(unsigned N) const { - if (!UseList) - return N == 0; - - // TODO: Disallow for ConstantData and remove !UseList check? return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { - // TODO: Disallow for ConstantData and remove !UseList check? - if (!UseList) - return N == 0; - return hasNItemsOrMore(use_begin(), use_end(), N); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac0..3f41618 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26..97298f9 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. 
+ LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5b5565a..10f2c80 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { @@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). 
// The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commutted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. 
There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. 
return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a..e3b0a1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. 
// diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a..aa5ea77 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e11..abefa32 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3a00267..6b06534 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -9869,12 +9869,32 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Pair of floats / doubles used to pass the result. 
Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; + bool ShouldUseSRet = getTM().isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const Align StackAlign = DL.getPrefTypeAlign(RetTy); + int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); + + ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + Args.emplace_back(Arg, ArgTy); StringRef LibcallName = getLibcallImplName(SincosStret); @@ -9884,10 +9904,25 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)); + .setCallee(CC, RetTy, Callee, std::move(Args)) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; + if (!ShouldUseSRet) + return CallResult.first; + + SDValue LoadSin = + DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); + + // Address of cos field. 
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); + SDValue LoadCos = + DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d..3d5a55c 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ private: void addVectorLoadStoreOperands(MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = 
IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 
+914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a..b0453fc 100644 --- 
a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab..5b72334 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. 
def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739..c56ce3f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. 
We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16554,38 +16582,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's @@ -16648,23 +16646,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. 
- if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787..b8ab70b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5a..494b1c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b97b508..6edf018 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53349,40 +53349,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 
64 : 32)) return SDValue(); // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + !(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53390,6 +53395,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. 
EVT AmtVT = ShAmt.getValueType(); @@ -53397,6 +53409,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. @@ -53408,16 +53421,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. - SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), Align(), St->getMemOperand()->getFlags()); } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47ae..a6ac761 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ 
b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + 
std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879..239526e 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch( static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. 
+ auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), + DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e..bb73327 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7570,6 +7570,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. 
+/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. + for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. 
+ if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8112,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d9bb14..2588c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) { bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; - auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, - VPSingleDefRecipe *Candidate) { - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. 
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || Candidate->mayReadOrWriteMemory()) - return false; + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In // the latter case, we need to duplicate SinkCandidate. 
auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { @@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; |
