Diffstat (limited to 'llvm/lib')
225 files changed, 3220 insertions, 1779 deletions
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 6fc81d787..da76f5b 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -833,6 +833,10 @@ bool llvm::canReplacePointersInUseIfEqual(const Use &U, const Value *To,
   if (!To->getType()->isPointerTy())
     return true;
 
+  // Do not perform replacements in lifetime intrinsic arguments.
+  if (isa<LifetimeIntrinsic>(U.getUser()))
+    return false;
+
   if (isPointerAlwaysReplaceable(&*U, To, DL))
     return true;
   return isPointerUseReplacable(U);
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index c8daab7a..28a2640 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -190,7 +190,21 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
     return MemoryLocation::getAfter(Arg, AATags);
 
   case Intrinsic::lifetime_start:
-  case Intrinsic::lifetime_end:
+  case Intrinsic::lifetime_end: {
+    assert(ArgIdx == 1 && "Invalid argument index");
+    auto *AI = dyn_cast<AllocaInst>(Arg);
+    if (!AI)
+      // lifetime of poison value.
+      return MemoryLocation::getBeforeOrAfter(Arg);
+
+    std::optional<TypeSize> AllocSize =
+        AI->getAllocationSize(II->getDataLayout());
+    return MemoryLocation(Arg,
+                          AllocSize ? LocationSize::precise(*AllocSize)
+                                    : LocationSize::afterPointer(),
+                          AATags);
+  }
+
   case Intrinsic::invariant_start:
     assert(ArgIdx == 1 && "Invalid argument index");
     return MemoryLocation(
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index 34a7a04..abe4985 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -59,44 +59,20 @@ bool StackLifetime::isAliveAfter(const AllocaInst *AI,
   return getLiveRange(AI).test(InstNum);
 }
 
-// Returns unique alloca annotated by lifetime marker only if
-// markers has the same size and points to the alloca start.
-static const AllocaInst *findMatchingAlloca(const IntrinsicInst &II,
-                                            const DataLayout &DL) {
-  const AllocaInst *AI = cast<AllocaInst>(II.getArgOperand(1));
-  auto AllocaSize = AI->getAllocationSize(DL);
-  if (!AllocaSize)
-    return nullptr;
-
-  auto *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
-  if (!Size)
-    return nullptr;
-  int64_t LifetimeSize = Size->getSExtValue();
-
-  if (LifetimeSize != -1 && uint64_t(LifetimeSize) != *AllocaSize)
-    return nullptr;
-
-  return AI;
-}
-
 void StackLifetime::collectMarkers() {
   InterestingAllocas.resize(NumAllocas);
   DenseMap<const BasicBlock *, SmallDenseMap<const IntrinsicInst *, Marker>>
       BBMarkerSet;
 
-  const DataLayout &DL = F.getDataLayout();
-
   // Compute the set of start/end markers per basic block.
   for (const BasicBlock *BB : depth_first(&F)) {
     for (const Instruction &I : *BB) {
       const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
       if (!II || !II->isLifetimeStartOrEnd())
         continue;
-      const AllocaInst *AI = findMatchingAlloca(*II, DL);
-      if (!AI) {
-        HasUnknownLifetimeStartOrEnd = true;
+      const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+      if (!AI)
         continue;
-      }
       auto It = AllocaNumbering.find(AI);
       if (It == AllocaNumbering.end())
         continue;
@@ -325,20 +301,6 @@ StackLifetime::StackLifetime(const Function &F,
 }
 
 void StackLifetime::run() {
-  if (HasUnknownLifetimeStartOrEnd) {
-    // There is marker which we can't assign to a specific alloca, so we
-    // fallback to the most conservative results for the type.
-    switch (Type) {
-    case LivenessType::May:
-      LiveRanges.resize(NumAllocas, getFullLiveRange());
-      break;
-    case LivenessType::Must:
-      LiveRanges.resize(NumAllocas, LiveRange(Instructions.size()));
-      break;
-    }
-    return;
-  }
-
   LiveRanges.resize(NumAllocas, LiveRange(Instructions.size()));
   for (unsigned I = 0; I < NumAllocas; ++I)
     if (!InterestingAllocas.test(I))
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index af85ce4..1e70228 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1351,6 +1351,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1369,7 +1371,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1381,10 +1382,32 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc, Q,
                          Depth + 1);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+    // Look through a cast from wider vector elements to narrow type.
+    // Examples: v2i64 -> v4i32
+    if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Q,
+                       Depth + 1);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != NumElts; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7e0d81f..05680fa 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -610,7 +610,7 @@ public:
     std::vector<StringRef> ModulePaths;
     for (auto &[ModPath, _] : Index.modulePaths())
       ModulePaths.push_back(ModPath);
-    llvm::sort(ModulePaths.begin(), ModulePaths.end());
+    llvm::sort(ModulePaths);
     for (auto &ModPath : ModulePaths)
       Callback(*Index.modulePaths().find(ModPath));
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 1641c3e..c72b6e8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3194,7 +3194,7 @@ void AsmPrinter::emitJumpTableSizesSection(const MachineJumpTableInfo &MJTI,
     return;
 
   if (isElf) {
-    MCSymbolELF *LinkedToSym = dyn_cast<MCSymbolELF>(CurrentFnSym);
+    auto *LinkedToSym = static_cast<MCSymbolELF *>(CurrentFnSym);
     int Flags = F.hasComdat() ? static_cast<int>(ELF::SHF_GROUP) : 0;
 
     JumpTableSizesSection = OutContext.getELFSection(
@@ -4702,7 +4702,7 @@ void AsmPrinter::emitXRayTable() {
   const Triple &TT = TM.getTargetTriple();
   // Use PC-relative addresses on all targets.
   if (TT.isOSBinFormatELF()) {
-    auto LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
+    auto LinkedToSym = static_cast<const MCSymbolELF *>(CurrentFnSym);
    auto Flags = ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER;
    StringRef GroupName;
    if (F.hasComdat()) {
@@ -4825,7 +4825,7 @@ void AsmPrinter::emitPatchableFunctionEntries() {
       Flags |= ELF::SHF_GROUP;
       GroupName = F.getComdat()->getName();
     }
-    LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
+    LinkedToSym = static_cast<const MCSymbolELF *>(CurrentFnSym);
   }
   OutStreamer->switchSection(OutContext.getELFSection(
       SectionName, ELF::SHT_PROGBITS, Flags, 0, GroupName, F.hasComdat(),
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 08ed78e..a7491a2 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -230,7 +230,7 @@ void DIEHash::hashBlockData(const DIE::const_value_range &Values) {
              "Base types referenced from DW_OP_convert should have a name");
       hashNestedType(C, Name);
     } else
-      Hash.update((uint64_t)V.getDIEInteger().getValue());
+      Hash.update(V.getDIEInteger().getValue());
 }
 
 // Hash the contents of a loclistptr class.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 5577a7d..f9d7e76 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -508,7 +508,8 @@ void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
   // don't want to depend on target specific headers in this code?
   const unsigned TI_GLOBAL_RELOC = 3;
   unsigned PointerSize = Asm->getDataLayout().getPointerSize();
-  auto *Sym = cast<MCSymbolWasm>(Asm->GetExternalSymbolSymbol(GlobalName));
+  auto *Sym =
+      static_cast<MCSymbolWasm *>(Asm->GetExternalSymbolSymbol(GlobalName));
   // FIXME: this repeats what WebAssemblyMCInstLower::
   // GetExternalSymbolSymbol does, since if there's no code that
   // refers to this symbol, we have to set it here.
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index ff265b5..260ce8f 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -1403,7 +1403,7 @@ void AssignmentTrackingLowering::addMemDef(BlockInfo *LiveSet, VariableID Var,
                                            const Assignment &AV) {
   LiveSet->setAssignment(BlockInfo::Stack, Var, AV);
 
-  // Use this assigment for all fragments contained within Var, but do not
+  // Use this assignment for all fragments contained within Var, but do not
   // provide a Source because we cannot convert Var's value to a value for the
   // fragment.
   Assignment FragAV = AV;
@@ -1416,7 +1416,7 @@ void AssignmentTrackingLowering::addDbgDef(BlockInfo *LiveSet, VariableID Var,
                                            const Assignment &AV) {
   LiveSet->setAssignment(BlockInfo::Debug, Var, AV);
 
-  // Use this assigment for all fragments contained within Var, but do not
+  // Use this assignment for all fragments contained within Var, but do not
   // provide a Source because we cannot convert Var's value to a value for the
   // fragment.
   Assignment FragAV = AV;
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f16283b..9223739 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1834,7 +1834,7 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
 ///
 /// Return true if any changes are made.
 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters())
+  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 0f2c580..59c62cf 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -95,7 +95,7 @@ void CSEMIRBuilder::profileSrcOp(const SrcOp &Op,
                                  GISelInstProfileBuilder &B) const {
   switch (Op.getSrcOpKind()) {
   case SrcOp::SrcType::Ty_Imm:
-    B.addNodeIDImmediate(static_cast<int64_t>(Op.getImm()));
+    B.addNodeIDImmediate(Op.getImm());
     break;
   case SrcOp::SrcType::Ty_Predicate:
     B.addNodeIDImmediate(static_cast<int64_t>(Op.getPredicate()));
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index fd38c30..bbfae57 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1592,9 +1592,19 @@ bool IRTranslator::translateGetElementPtr(const User &U,
   Type *OffsetIRTy = DL->getIndexType(PtrIRTy);
   LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
 
-  uint32_t Flags = 0;
+  uint32_t PtrAddFlags = 0;
+  // Each PtrAdd generated to implement the GEP inherits its nuw, nusw, inbounds
+  // flags.
   if (const Instruction *I = dyn_cast<Instruction>(&U))
-    Flags = MachineInstr::copyFlagsFromInstruction(*I);
+    PtrAddFlags = MachineInstr::copyFlagsFromInstruction(*I);
+
+  auto PtrAddFlagsWithConst = [&](int64_t Offset) {
+    // For nusw/inbounds GEP with an offset that is nonnegative when interpreted
+    // as signed, assume there is no unsigned overflow.
+    if (Offset >= 0 && (PtrAddFlags & MachineInstr::MIFlag::NoUSWrap))
+      return PtrAddFlags | MachineInstr::MIFlag::NoUWrap;
+    return PtrAddFlags;
+  };
 
   // Normalize Vector GEP - all scalar operands should be converted to the
   // splat vector.
@@ -1644,7 +1654,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
 
       if (Offset != 0) {
         auto OffsetMIB = MIRBuilder.buildConstant({OffsetTy}, Offset);
-        BaseReg = MIRBuilder.buildPtrAdd(PtrTy, BaseReg, OffsetMIB.getReg(0))
+        BaseReg = MIRBuilder
+                      .buildPtrAdd(PtrTy, BaseReg, OffsetMIB.getReg(0),
+                                   PtrAddFlagsWithConst(Offset))
                       .getReg(0);
         Offset = 0;
       }
@@ -1668,12 +1680,23 @@ bool IRTranslator::translateGetElementPtr(const User &U,
       if (ElementSize != 1) {
         auto ElementSizeMIB = MIRBuilder.buildConstant(
             getLLTForType(*OffsetIRTy, *DL), ElementSize);
+
+        // The multiplication is NUW if the GEP is NUW and NSW if the GEP is
+        // NUSW.
+        uint32_t ScaleFlags = PtrAddFlags & MachineInstr::MIFlag::NoUWrap;
+        if (PtrAddFlags & MachineInstr::MIFlag::NoUSWrap)
+          ScaleFlags |= MachineInstr::MIFlag::NoSWrap;
+
         GepOffsetReg =
-            MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB).getReg(0);
-      } else
+            MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB, ScaleFlags)
+                .getReg(0);
+      } else {
         GepOffsetReg = IdxReg;
+      }
 
-      BaseReg = MIRBuilder.buildPtrAdd(PtrTy, BaseReg, GepOffsetReg).getReg(0);
+      BaseReg =
+          MIRBuilder.buildPtrAdd(PtrTy, BaseReg, GepOffsetReg, PtrAddFlags)
+              .getReg(0);
     }
   }
 
@@ -1681,11 +1704,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
     auto OffsetMIB = MIRBuilder.buildConstant(OffsetTy, Offset);
 
-    if (Offset >= 0 && cast<GEPOperator>(U).isInBounds())
-      Flags |= MachineInstr::MIFlag::NoUWrap;
-
     MIRBuilder.buildPtrAdd(getOrCreateVReg(U), BaseReg, OffsetMIB.getReg(0),
-                           Flags);
+                           PtrAddFlagsWithConst(Offset));
 
   return true;
 }
@@ -2189,8 +2209,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
                                                   : TargetOpcode::LIFETIME_END;
 
-    const AllocaInst *AI = cast<AllocaInst>(CI.getArgOperand(1));
-    if (!AI->isStaticAlloca())
+    const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1));
+    if (!AI || !AI->isStaticAlloca())
       return true;
 
     MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index f48bfc0..8955dd0 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1401,6 +1401,21 @@ bool llvm::isBuildVectorConstantSplat(const Register Reg,
   return false;
 }
 
+bool llvm::isBuildVectorConstantSplat(const Register Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      APInt SplatValue, bool AllowUndef) {
+  if (auto SplatValAndReg = getAnyConstantSplat(Reg, MRI, AllowUndef)) {
+    if (SplatValAndReg->Value.getBitWidth() < SplatValue.getBitWidth())
+      return APInt::isSameValue(
+          SplatValAndReg->Value.sext(SplatValue.getBitWidth()), SplatValue);
+    return APInt::isSameValue(
+        SplatValAndReg->Value,
+        SplatValue.sext(SplatValAndReg->Value.getBitWidth()));
+  }
+
+  return false;
+}
+
 bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI,
                                       const MachineRegisterInfo &MRI,
                                       int64_t SplatValue, bool AllowUndef) {
@@ -1408,6 +1423,13 @@ bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI,
                                        AllowUndef);
 }
 
+bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI,
+                                      const MachineRegisterInfo &MRI,
+                                      APInt SplatValue, bool AllowUndef) {
+  return isBuildVectorConstantSplat(MI.getOperand(0).getReg(), MRI, SplatValue,
+                                    AllowUndef);
+}
+
 std::optional<APInt> llvm::getIConstantSplatVal(const Register Reg,
                                                 const MachineRegisterInfo &MRI) {
   if (auto SplatValAndReg =
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 7ede564..514f2f0 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -306,12 +306,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
   /// number if it is not zero. If DstReg is a physical register and the
   /// existing subregister number of the def / use being updated is not zero,
   /// make sure to set it to the correct physical subregister.
-  ///
-  /// If \p SubregToRegSrcInst is not empty, we are coalescing a
-  /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an
-  /// implicit-def of DstReg on instructions that define SrcReg.
-  void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
-                         ArrayRef<MachineInstr *> SubregToRegSrcInst = {});
+  void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
 
   /// If the given machine operand reads only undefined lanes add an undef
   /// flag.
@@ -1448,7 +1443,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
 
   // CopyMI may have implicit operands, save them so that we can transfer them
   // over to the newly materialized instruction after CopyMI is removed.
-  LaneBitmask NewMIImplicitOpsMask;
   SmallVector<MachineOperand, 4> ImplicitOps;
   ImplicitOps.reserve(CopyMI->getNumOperands() -
                       CopyMI->getDesc().getNumOperands());
@@ -1463,9 +1457,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
              (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) &&
             "unexpected implicit virtual register def");
       ImplicitOps.push_back(MO);
-      if (MO.isDef() && MO.getReg().isVirtual() &&
-          MRI->shouldTrackSubRegLiveness(DstReg))
-        NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
     }
   }
 
@@ -1508,11 +1499,14 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
     } else {
       assert(MO.getReg() == NewMI.getOperand(0).getReg());
 
-      // If lanemasks need to be tracked, compile the lanemask of the NewMI
-      // implicit def operands to avoid subranges for the super-regs from
-      // being removed by code later on in this function.
-      if (MRI->shouldTrackSubRegLiveness(MO.getReg()))
-        NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg());
+      // We're only expecting another def of the main output, so the range
+      // should get updated with the regular output range.
+      //
+      // FIXME: The range updating below probably needs updating to look at
+      // the super register if subranges are tracked.
+      assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
+             "subrange update for implicit-def of super register may not be "
+             "properly handled");
     }
   }
 }
@@ -1612,8 +1606,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
       CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
   VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
   for (LiveInterval::SubRange &SR : DstInt.subranges()) {
-    if ((SR.LaneMask & DstMask).none() &&
-        (SR.LaneMask & NewMIImplicitOpsMask).none()) {
+    if ((SR.LaneMask & DstMask).none()) {
       LLVM_DEBUG(dbgs() << "Removing undefined SubRange "
                         << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
@@ -1631,11 +1624,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
       UpdatedSubRanges = true;
     } else {
       // We know that this lane is defined by this instruction,
-      // but at this point it may be empty because it is not used by
-      // anything. This happens when updateRegDefUses adds the missing
-      // lanes. Assign that lane a dead def so that the interferences
-      // are properly modeled.
-      if (SR.empty())
+      // but at this point it might not be live because it was not defined
+      // by the original instruction. This happens when the
+      // rematerialization widens the defined register. Assign that lane a
+      // dead def so that the interferences are properly modeled.
+      if (!SR.liveAt(DefIndex))
         SR.createDeadDef(DefIndex, Alloc);
     }
   }
@@ -1877,14 +1870,11 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
   }
 }
 
-void RegisterCoalescer::updateRegDefsUses(
-    Register SrcReg, Register DstReg, unsigned SubIdx,
-    ArrayRef<MachineInstr *> SubregToRegSrcInsts) {
+void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
+                                          unsigned SubIdx) {
   bool DstIsPhys = DstReg.isPhysical();
   LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
 
-  // Coalescing a COPY may expose reads of 'undef' subregisters.
-  // If so, then explicitly propagate 'undef' to those operands.
   if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
     for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
       if (MO.isUndef())
@@ -1901,15 +1891,6 @@ void RegisterCoalescer::updateRegDefsUses(
     }
   }
 
-  // If DstInt already has a subrange for the unused lanes, then we shouldn't
-  // create duplicate subranges when we update the interval for unused lanes.
-  LaneBitmask DstIntLaneMask;
-  if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
-    for (LiveInterval::SubRange &SR : DstInt->subranges())
-      DstIntLaneMask |= SR.LaneMask;
-  }
-
-  // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'.
   SmallPtrSet<MachineInstr *, 8> Visited;
   for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg),
                                                E = MRI->reg_instr_end();
@@ -1933,80 +1914,6 @@ void RegisterCoalescer::updateRegDefsUses(
     if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
       Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
 
-    bool RequiresImplicitRedef = false;
-    if (!SubregToRegSrcInsts.empty()) {
-      // We can only add an implicit-def and undef if the sub registers match,
-      // e.g.
-      //    %0:gr32 = INSTX
-      //    %0.sub8:gr32 = INSTY  // top 24 bits of %0 still defined
-      //    %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32
-      //
-      // This cannot be transformed into:
-      //    %1.sub32:gr64 = INSTX
-      //    undef %1.sub8:gr64 = INSTY , implicit-def %1
-      //
-      // Because that would thrash the top 24 bits of %1.sub32.
-      if (is_contained(SubregToRegSrcInsts, UseMI) &&
-          all_of(UseMI->defs(),
-                 [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool {
-                   if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef())
-                     return true;
-                   return SubIdx == MO.getSubReg();
-                 })) {
-        // Add implicit-def of super-register to express that the whole
-        // register is defined by the instruction.
-        MachineInstrBuilder MIB(*MF, UseMI);
-        MIB.addReg(DstReg, RegState::ImplicitDefine);
-        RequiresImplicitRedef = true;
-      }
-
-      // If the coalesed instruction doesn't fully define the register, we need
-      // to preserve the original super register liveness for SUBREG_TO_REG.
-      //
-      // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
-      // but it introduces liveness for other subregisters. Downstream users may
-      // have been relying on those bits, so we need to ensure their liveness is
-      // captured with a def of other lanes.
-      if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
-        // First check if there is sufficient granularity in terms of subranges.
-        LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg());
-        LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx);
-        LaneBitmask UnusedLanes = DstMask & ~UsedLanes;
-        if ((UnusedLanes & ~DstIntLaneMask).any()) {
-          BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
-          DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt);
-          DstIntLaneMask |= UnusedLanes;
-        }
-
-        // After duplicating the live ranges for the low/hi bits, we
-        // need to update the subranges of the DstReg interval such that
-        // for a case like this:
-        //
-        // entry:
-        //  16B %1:gpr32 = INSTRUCTION (<=> UseMI)
-        //       :
-        // if.then:
-        //   32B %1:gpr32 = MOVIMM32 ..
-        //   48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32
-        //
-        // Only the MOVIMM32 require a def of the top lanes and any intervals
-        // for the top 32-bits of the def at 16B should be removed.
-        for (LiveInterval::SubRange &SR : DstInt->subranges()) {
-          if (!Writes || RequiresImplicitRedef ||
-              (SR.LaneMask & UnusedLanes).none())
-            continue;
-
-          assert((SR.LaneMask & UnusedLanes) == SR.LaneMask &&
-                 "Unexpected lanemask. Subrange needs finer granularity");
-
-          SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false);
-          auto SegmentI = SR.find(UseIdx);
-          if (SegmentI != SR.end())
-            SR.removeSegment(SegmentI, true);
-        }
-      }
-    }
-
     // Replace SrcReg with DstReg in all UseMI operands.
     for (unsigned Op : Ops) {
       MachineOperand &MO = UseMI->getOperand(Op);
@@ -2015,7 +1922,7 @@ void RegisterCoalescer::updateRegDefsUses(
       // turn a full def into a read-modify-write sub-register def and vice
       // versa.
       if (SubIdx && MO.isDef())
-        MO.setIsUndef(!Reads || RequiresImplicitRedef);
+        MO.setIsUndef(!Reads);
 
       // A subreg use of a partially undef (super) register may be a complete
       // undef use now and then has to be marked that way.
@@ -2118,30 +2025,6 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
   LIS->shrinkToUses(&LI);
 }
 
-/// For a given use of value \p Idx, it returns the def in the current block,
-/// or otherwise all possible defs in preceding blocks.
-static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks,
-                           SmallVector<MachineInstr *> &Instrs,
-                           LiveIntervals *LIS, LiveInterval &SrcInt,
-                           MachineBasicBlock *MBB, VNInfo *Idx) {
-  if (!Idx->isPHIDef()) {
-    MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def);
-    assert(Def && "Unable to find a def for SUBREG_TO_REG source operand");
-    Instrs.push_back(Def);
-    return true;
-  }
-
-  bool Any = false;
-  if (VisitedBlocks.count(MBB))
-    return false;
-  VisitedBlocks.insert(MBB);
-  for (MachineBasicBlock *Pred : MBB->predecessors()) {
-    Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred,
-                          SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred)));
-  }
-  return Any;
-}
-
 bool RegisterCoalescer::joinCopy(
     MachineInstr *CopyMI, bool &Again,
     SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
@@ -2273,35 +2156,6 @@ bool RegisterCoalescer::joinCopy(
     });
   }
 
-  SmallVector<MachineInstr *> SubregToRegSrcInsts;
-  if (CopyMI->isSubregToReg()) {
-    // For the case where the copy instruction is a SUBREG_TO_REG, e.g.
-    //
-    //   %0:gpr32 = movimm32 ..
-    //   %1:gpr64 = SUBREG_TO_REG 0, %0, sub32
-    //   ...
-    //   %0:gpr32 = COPY <something>
-    //
-    // After joining liveranges, the original `movimm32` will need an
-    // implicit-def to make it explicit that the entire register is written,
-    // i.e.
-    //
-    //   undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0
-    //   ...
-    //   undef %0.sub32:gpr64 = COPY <something> // Note that this does not
-    //                                           // require an implicit-def,
-    //                                           // because it has nothing to
-    //                                           // do with the SUBREG_TO_REG.
-    LiveInterval &SrcInt =
-        LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
-    SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI);
-    SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks;
-    if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt,
-                        CopyMI->getParent(),
-                        SrcInt.Query(SubregToRegSlotIdx).valueIn()))
-      llvm_unreachable("SUBREG_TO_REG src requires a def");
-  }
-
   ShrinkMask = LaneBitmask::getNone();
   ShrinkMainRange = false;
 
@@ -2371,12 +2225,9 @@ bool RegisterCoalescer::joinCopy(
 
   // Rewrite all SrcReg operands to DstReg.
   // Also update DstReg operands to include DstIdx if it is set.
-  if (CP.getDstIdx()) {
-    assert(SubregToRegSrcInsts.empty() && "can this happen?");
+  if (CP.getDstIdx())
     updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
-  }
-  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
-                    SubregToRegSrcInsts);
+  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
 
   // Shrink subregister ranges if necessary.
   if (ShrinkMask.any()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 11e869a..d70e96938 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4075,18 +4075,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   unsigned BitWidth = VT.getScalarSizeInBits();
   SDLoc DL(N);
 
-  auto PeekThroughFreeze = [](SDValue N) {
-    if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
-      return N->getOperand(0);
-    return N;
-  };
-
   if (SDValue V = foldSubCtlzNot<EmptyMatchContext>(N, DAG))
     return V;
 
   // fold (sub x, x) -> 0
-  // FIXME: Refactor this and xor and other similar operations together.
-  if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
+  if (N0 == N1)
     return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
 
   // fold (sub c1, c2) -> c3
@@ -6499,19 +6492,21 @@ static unsigned getMinMaxOpcodeForFP(SDValue Operand1, SDValue Operand2,
   // It is safe to use FMINNUM_IEEE/FMAXNUM_IEEE if all the operands
   // are non NaN values.
   if (((CC == ISD::SETLT || CC == ISD::SETLE) && (OrAndOpcode == ISD::OR)) ||
-      ((CC == ISD::SETGT || CC == ISD::SETGE) && (OrAndOpcode == ISD::AND)))
+      ((CC == ISD::SETGT || CC == ISD::SETGE) && (OrAndOpcode == ISD::AND))) {
     return arebothOperandsNotNan(Operand1, Operand2, DAG) &&
                    isFMAXNUMFMINNUM_IEEE
                ? ISD::FMINNUM_IEEE
               : ISD::DELETED_NODE;
-  else if (((CC == ISD::SETGT || CC == ISD::SETGE) &&
-            (OrAndOpcode == ISD::OR)) ||
-           ((CC == ISD::SETLT || CC == ISD::SETLE) &&
-            (OrAndOpcode == ISD::AND)))
+  }
+
+  if (((CC == ISD::SETGT || CC == ISD::SETGE) && (OrAndOpcode == ISD::OR)) ||
+      ((CC == ISD::SETLT || CC == ISD::SETLE) && (OrAndOpcode == ISD::AND))) {
     return arebothOperandsNotNan(Operand1, Operand2, DAG) &&
                    isFMAXNUMFMINNUM_IEEE
                ? ISD::FMAXNUM_IEEE
               : ISD::DELETED_NODE;
+  }
+
   // Both FMINNUM/FMAXNUM and FMINNUM_IEEE/FMAXNUM_IEEE handle quiet
   // NaNs in the same way. But, FMINNUM/FMAXNUM and FMINNUM_IEEE/
   // FMAXNUM_IEEE handle signaling NaNs differently. If we cannot prove
@@ -6521,24 +6516,24 @@ static unsigned getMinMaxOpcodeForFP(SDValue Operand1, SDValue Operand2,
   // we can prove that we do not have any sNaNs, then we can do the
   // optimization using FMINNUM_IEEE/FMAXNUM_IEEE for the following
   // cases.
-  else if (((CC == ISD::SETOLT || CC == ISD::SETOLE) &&
-            (OrAndOpcode == ISD::OR)) ||
-           ((CC == ISD::SETUGT || CC == ISD::SETUGE) &&
-            (OrAndOpcode == ISD::AND)))
+  if (((CC == ISD::SETOLT || CC == ISD::SETOLE) && (OrAndOpcode == ISD::OR)) ||
+      ((CC == ISD::SETUGT || CC == ISD::SETUGE) && (OrAndOpcode == ISD::AND))) {
     return isFMAXNUMFMINNUM ? ISD::FMINNUM
-           : arebothOperandsNotSNan(Operand1, Operand2, DAG) &&
-                   isFMAXNUMFMINNUM_IEEE
-               ? ISD::FMINNUM_IEEE
-               : ISD::DELETED_NODE;
-  else if (((CC == ISD::SETOGT || CC == ISD::SETOGE) &&
-            (OrAndOpcode == ISD::OR)) ||
-           ((CC == ISD::SETULT || CC == ISD::SETULE) &&
-            (OrAndOpcode == ISD::AND)))
+              : arebothOperandsNotSNan(Operand1, Operand2, DAG) &&
+                      isFMAXNUMFMINNUM_IEEE
+                  ? ISD::FMINNUM_IEEE
+                  : ISD::DELETED_NODE;
+  }
+
+  if (((CC == ISD::SETOGT || CC == ISD::SETOGE) && (OrAndOpcode == ISD::OR)) ||
+      ((CC == ISD::SETULT || CC == ISD::SETULE) && (OrAndOpcode == ISD::AND))) {
     return isFMAXNUMFMINNUM ? ISD::FMAXNUM
-           : arebothOperandsNotSNan(Operand1, Operand2, DAG) &&
-                   isFMAXNUMFMINNUM_IEEE
-               ? ISD::FMAXNUM_IEEE
-               : ISD::DELETED_NODE;
+              : arebothOperandsNotSNan(Operand1, Operand2, DAG) &&
+                      isFMAXNUMFMINNUM_IEEE
+                  ? ISD::FMAXNUM_IEEE
+                  : ISD::DELETED_NODE;
+  }
+
   return ISD::DELETED_NODE;
 }
@@ -13184,14 +13179,14 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal,
 
   // select Cond, -1, x → or Cond, x
   if (IsTAllOne) {
-    SDValue X = DAG.getBitcast(CondVT, FVal);
+    SDValue X = DAG.getBitcast(CondVT, DAG.getFreeze(FVal));
     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, X);
     return DAG.getBitcast(VT, Or);
   }
 
   // select Cond, x, 0 → and Cond, x
   if (IsFAllZero) {
-    SDValue X = DAG.getBitcast(CondVT, TVal);
+    SDValue X = DAG.getBitcast(CondVT, DAG.getFreeze(TVal));
     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, X);
     return DAG.getBitcast(VT, And);
   }
@@ -13199,7 +13194,7 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal,
   // select Cond, 0, x -> and not(Cond), x
   if (IsTAllZero &&
       (isBitwiseNot(peekThroughBitcasts(Cond)) || TLI.hasAndNot(Cond))) {
-    SDValue X = DAG.getBitcast(CondVT, FVal);
+    SDValue X = DAG.getBitcast(CondVT, DAG.getFreeze(FVal));
     SDValue And =
         DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT), X);
     return DAG.getBitcast(VT, And);
@@ -16754,6 +16749,17 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
     return N0;
 
+  // If we have frozen and unfrozen users of N0, update so everything uses N.
+  if (!N0.isUndef() && !N0.hasOneUse()) {
+    SDValue FrozenN0(N, 0);
+    DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0);
+    // ReplaceAllUsesOfValueWith will have also updated the use in N, thus
+    // creating a cycle in a DAG. Let's undo that by mutating the freeze.
+    assert(N->getOperand(0) == FrozenN0 && "Expected cycle in DAG");
+    DAG.UpdateNodeOperands(N, N0);
+    return FrozenN0;
+  }
+
   // We currently avoid folding freeze over SRA/SRL, due to the problems seen
   // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
   // example https://reviews.llvm.org/D136529#4120959.
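
The visitFREEZE change above canonicalizes mixed frozen and unfrozen users: once one use of a value is frozen, every user is redirected to the frozen node so later combines see a single value. The subtlety is that ReplaceAllUsesOfValueWith also rewrites the freeze's own operand, briefly making the node its own input, which must then be undone. A minimal sketch of that pattern, using hypothetical Val/FreezeNode names rather than anything from the patch:

    // Redirect every user of Val, including FreezeNode itself, to the
    // frozen value.
    SDValue Frozen(FreezeNode, 0);
    DAG.ReplaceAllUsesOfValueWith(Val, Frozen);
    // FreezeNode now feeds itself; restore its original operand to break
    // the self-cycle before returning the frozen value.
    DAG.UpdateNodeOperands(FreezeNode, Val);
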
@@ -16807,8 +16813,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   SmallSet<SDValue, 8> MaybePoisonOperands;
   SmallVector<unsigned, 8> MaybePoisonOperandNumbers;
   for (auto [OpNo, Op] : enumerate(N0->ops())) {
-    if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
-                                             /*Depth*/ 1))
+    if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly=*/false))
       continue;
     bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
     bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
@@ -22534,6 +22539,56 @@ SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
   return SDValue();
 }
 
+static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
+                                 const SDLoc &Dl) {
+  if (!Store->isSimple() || !ISD::isNormalStore(Store))
+    return SDValue();
+
+  SDValue StoredVal = Store->getValue();
+  SDValue StorePtr = Store->getBasePtr();
+  SDValue StoreOffset = Store->getOffset();
+  EVT VT = Store->getMemoryVT();
+  unsigned AddrSpace = Store->getAddressSpace();
+  Align Alignment = Store->getAlign();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrCustom(ISD::MSTORE, VT) ||
+      !TLI.allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment))
+    return SDValue();
+
+  SDValue Mask, OtherVec, LoadCh;
+  unsigned LoadPos;
+  if (sd_match(StoredVal,
+               m_VSelect(m_Value(Mask), m_Value(OtherVec),
+                         m_Load(m_Value(LoadCh), m_Specific(StorePtr),
+                                m_Specific(StoreOffset))))) {
+    LoadPos = 2;
+  } else if (sd_match(StoredVal,
+                      m_VSelect(m_Value(Mask),
+                                m_Load(m_Value(LoadCh), m_Specific(StorePtr),
+                                       m_Specific(StoreOffset)),
+                                m_Value(OtherVec)))) {
+    LoadPos = 1;
+  } else {
+    return SDValue();
+  }
+
+  auto *Load = cast<LoadSDNode>(StoredVal.getOperand(LoadPos));
+  if (!Load->isSimple() || !ISD::isNormalLoad(Load) ||
+      Load->getAddressSpace() != AddrSpace)
+    return SDValue();
+
+  if (!Store->getChain().reachesChainWithoutSideEffects(LoadCh))
+    return SDValue();
+
+  if (LoadPos == 1)
+    Mask = DAG.getNOT(Dl, Mask, Mask.getValueType());
+
+  return DAG.getMaskedStore(Store->getChain(), Dl, OtherVec, StorePtr,
+                            StoreOffset, Mask, VT, Store->getMemOperand(),
+                            Store->getAddressingMode());
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -22768,6 +22823,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (SDValue NewSt = splitMergedValStore(ST))
     return NewSt;
 
+  if (SDValue MaskedStore = foldToMaskedStore(ST, DAG, SDLoc(N)))
+    return MaskedStore;
+
   return ReduceLoadOpStoreWidth(N);
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 583a85a..a5bd97a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2217,8 +2217,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
 
   switch (getTypeAction(InVT)) {
   case TargetLowering::TypePromoteInteger: {
-    // TODO: Handle big endian
-    if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) {
+    // TODO: Handle big endian & vector input type.
+    if (OutVT.isVector() && !InVT.isVector() &&
+        DAG.getDataLayout().isLittleEndian()) {
       EVT EltVT = OutVT.getVectorElementType();
       TypeSize EltSize = EltVT.getSizeInBits();
       TypeSize NInSize = NInVT.getSizeInBits();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f41b6eb..61f1144 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6351,8 +6351,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     break;
   case ISD::FREEZE:
     assert(VT == N1.getValueType() && "Unexpected VT!");
-    if (isGuaranteedNotToBeUndefOrPoison(N1, /*PoisonOnly*/ false,
-                                         /*Depth*/ 1))
+    if (isGuaranteedNotToBeUndefOrPoison(N1, /*PoisonOnly=*/false))
       return N1;
     break;
   case ISD::TokenFactor:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 306e068..ac0440f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7598,7 +7598,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     if (TM.getOptLevel() == CodeGenOptLevel::None)
       return;
 
-    const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
+    const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1));
+    if (!LifetimeObject)
+      return;
 
     // First check that the Alloca is static, otherwise it won't have a
     // valid frame index.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 48d6b99..a68f521 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -775,13 +775,6 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     break;
   }
-  case ISD::FREEZE: {
-    SDValue N0 = Op.getOperand(0);
-    if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
-                                             /*PoisonOnly=*/false, Depth + 1))
-      return N0;
-    break;
-  }
   case ISD::AND: {
     LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -5363,10 +5356,25 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       if (AndRHSC.isNegatedPowerOf2() && C1.isSubsetOf(AndRHSC)) {
         unsigned ShiftBits = AndRHSC.countr_zero();
         if (!shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
+          // If using an unsigned shift doesn't yield a legal compare
+          // immediate, try using sra instead.
+          APInt NewC = C1.lshr(ShiftBits);
+          if (NewC.getSignificantBits() <= 64 &&
+              !isLegalICmpImmediate(NewC.getSExtValue())) {
+            APInt SignedC = C1.ashr(ShiftBits);
+            if (SignedC.getSignificantBits() <= 64 &&
+                isLegalICmpImmediate(SignedC.getSExtValue())) {
+              SDValue Shift = DAG.getNode(
+                  ISD::SRA, dl, ShValTy, N0.getOperand(0),
+                  DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl));
+              SDValue CmpRHS = DAG.getConstant(SignedC, dl, ShValTy);
+              return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
+            }
+          }
           SDValue Shift = DAG.getNode(
               ISD::SRL, dl, ShValTy, N0.getOperand(0),
               DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl));
-          SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, ShValTy);
+          SDValue CmpRHS = DAG.getConstant(NewC, dl, ShValTy);
           return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
         }
       }
@@ -6482,8 +6490,8 @@ SDValue TargetLowering::buildSDIVPow2WithCMov(
   Created.push_back(CMov.getNode());
 
   // Divide by pow2.
-  SDValue SRA =
-      DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, VT));
+  SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CMov,
+                            DAG.getShiftAmountConstant(Lg2, VT, DL));
 
   // If we're dividing by a positive value, we're done. Otherwise, we must
   // negate the result.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 705e046e..9e49ddd 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -993,7 +993,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
     MI = MRI.getUniqueVRegDef(MO.getReg());
   // And it needs to be in the trace (otherwise, it won't have a depth).
   if (!MI || MI->getParent() != &MBB ||
-      ((unsigned)MI->getOpcode() != CombineOpc && CombineOpc != 0))
+      (MI->getOpcode() != CombineOpc && CombineOpc != 0))
     return false;
   // Must only used by the user we combine with.
   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3c91b0e..9f525ea 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -697,7 +697,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
      MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
-  HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
@@ -905,6 +904,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::GET_FPENV, VT, Expand);
     setOperationAction(ISD::SET_FPENV, VT, Expand);
     setOperationAction(ISD::RESET_FPENV, VT, Expand);
+
+    setOperationAction(ISD::MSTORE, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index e9172f4..d19ef92 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -402,8 +402,8 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
     const MachineModuleInfo *MMI) const {
   SmallString<64> NameData("DW.ref.");
   NameData += Sym->getName();
-  MCSymbolELF *Label =
-      cast<MCSymbolELF>(getContext().getOrCreateSymbol(NameData));
+  auto *Label =
+      static_cast<MCSymbolELF *>(getContext().getOrCreateSymbol(NameData));
   Streamer.emitSymbolAttribute(Label, MCSA_Hidden);
   Streamer.emitSymbolAttribute(Label, MCSA_Weak);
   unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
@@ -581,7 +581,8 @@ static const MCSymbolELF *getLinkedToSymbol(const GlobalObject *GO,
 
   auto *VM = cast<ValueAsMetadata>(MD->getOperand(0).get());
   auto *OtherGV = dyn_cast<GlobalValue>(VM->getValue());
-  return OtherGV ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGV)) : nullptr;
+  return OtherGV ? static_cast<const MCSymbolELF *>(TM.getSymbol(OtherGV))
+                 : nullptr;
 }
 
 static unsigned getEntrySizeForKind(SectionKind Kind) {
@@ -1011,7 +1012,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForLSDA(
       (getContext().getAsmInfo()->useIntegratedAssembler() &&
       getContext().getAsmInfo()->binutilsIsAtLeast(2, 36))) {
     Flags |= ELF::SHF_LINK_ORDER;
-    LinkedToSym = cast<MCSymbolELF>(&FnSym);
+    LinkedToSym = static_cast<const MCSymbolELF *>(&FnSym);
   }
 
   // Append the function name as the suffix like GCC, assuming
@@ -2370,9 +2371,10 @@ bool TargetLoweringObjectFileXCOFF::ShouldSetSSPCanaryBitInTB(
 
 MCSymbol *
 TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(const MachineFunction *MF) {
-  MCSymbol *EHInfoSym = MF->getContext().getOrCreateSymbol(
-      "__ehinfo." + Twine(MF->getFunctionNumber()));
-  cast<MCSymbolXCOFF>(EHInfoSym)->setEHInfo();
+  auto *EHInfoSym =
+      static_cast<MCSymbolXCOFF *>(MF->getContext().getOrCreateSymbol(
+          "__ehinfo." + Twine(MF->getFunctionNumber())));
+  EHInfoSym->setEHInfo();
   return EHInfoSym;
 }
 
@@ -2510,7 +2512,8 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal(
 
   if (Kind.isText()) {
     if (TM.getFunctionSections()) {
-      return cast<MCSymbolXCOFF>(getFunctionEntryPointSymbol(GO, TM))
+      return static_cast<const MCSymbolXCOFF *>(
+                 getFunctionEntryPointSymbol(GO, TM))
           ->getRepresentedCsect();
     }
     return TextSection;
@@ -2713,7 +2716,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
     const MCSymbol *Sym, const TargetMachine &TM) const {
   const XCOFF::StorageMappingClass SMC = [](const MCSymbol *Sym,
                                             const TargetMachine &TM) {
-    const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(Sym);
+    auto *XSym = static_cast<const MCSymbolXCOFF *>(Sym);
 
     // The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC,
     // otherwise the AIX assembler will complain.
@@ -2737,8 +2740,8 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
   }(Sym, TM);
 
   return getContext().getXCOFFSection(
-      cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), SectionKind::getData(),
-      XCOFF::CsectProperties(SMC, XCOFF::XTY_SD));
+      static_cast<const MCSymbolXCOFF *>(Sym)->getSymbolTableName(),
+      SectionKind::getData(), XCOFF::CsectProperties(SMC, XCOFF::XTY_SD));
 }
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 6d0a94d..73df62a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -2260,7 +2260,7 @@ public:
         continue;
       }
 
-      if (Section.relocation_begin() == Section.relocation_end())
+      if (Section.relocations().empty())
        continue;
 
       // Symbol to [address, section index] cache mapping.
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 308dac4..09ac0f1 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -261,7 +261,7 @@ private:
 
       // Skip relocations virtual sections.
       if (S.isVirtual()) {
-        if (S.relocation_begin() != S.relocation_end())
+        if (!S.relocations().empty())
          return make_error<JITLinkError>("Virtual section contains "
                                          "relocations");
        continue;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index bb5f3ab..27209a8 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -236,7 +236,7 @@ private:
 
       // Skip relocations virtual sections.
      if (S.isVirtual()) {
-        if (S.relocation_begin() != S.relocation_end())
+        if (!S.relocations().empty())
          return make_error<JITLinkError>("Virtual section contains "
                                          "relocations");
        continue;
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index a8559e7..6de6cc7 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -891,7 +891,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
     // Align DataSize to stub alignment if we have any stubs (PaddingSize will
     // have been increased above to account for this).
     if (StubBufSize > 0)
-      DataSize &= -(uint64_t)getStubAlignment().value();
+      DataSize &= -getStubAlignment().value();
   }
 
   LLVM_DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: "
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index ffc7696..08d6c78 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -2822,7 +2822,7 @@ Error RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
   // object's sections to GOTs.
   for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
        SI != SE; ++SI) {
-    if (SI->relocation_begin() != SI->relocation_end()) {
+    if (!SI->relocations().empty()) {
       Expected<section_iterator> RelSecOrErr = SI->getRelocatedSection();
       if (!RelSecOrErr)
         return make_error<RuntimeDyldError>(
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 48ff1ca..6d89fa7 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -270,11 +270,11 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD,
 Error MetadataParser::parseRootDescriptors(
     mcdxbc::RootSignatureDesc &RSD, MDNode *RootDescriptorNode,
     RootSignatureElementKind ElementKind) {
-  assert(ElementKind == RootSignatureElementKind::SRV ||
-         ElementKind == RootSignatureElementKind::UAV ||
-         ElementKind == RootSignatureElementKind::CBV &&
-             "parseRootDescriptors should only be called with RootDescriptor "
-             "element kind.");
+  assert((ElementKind == RootSignatureElementKind::SRV ||
+          ElementKind == RootSignatureElementKind::UAV ||
+          ElementKind == RootSignatureElementKind::CBV) &&
+         "parseRootDescriptors should only be called with RootDescriptor "
+         "element kind.");
 
   if (RootDescriptorNode->getNumOperands() != 5)
     return make_error<InvalidRSMetadataFormat>("Root Descriptor Element");
diff --git a/llvm/lib/Frontend/Offloading/CMakeLists.txt b/llvm/lib/Frontend/Offloading/CMakeLists.txt
index 8e1ede9..9747dbd 100644
--- a/llvm/lib/Frontend/Offloading/CMakeLists.txt
+++ b/llvm/lib/Frontend/Offloading/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMFrontendOffloading
   Utility.cpp
   OffloadWrapper.cpp
+  PropertySet.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend
diff --git a/llvm/lib/Frontend/Offloading/PropertySet.cpp b/llvm/lib/Frontend/Offloading/PropertySet.cpp
new file mode 100644
index 0000000..a70290d
--- /dev/null
+++ b/llvm/lib/Frontend/Offloading/PropertySet.cpp
@@ -0,0 +1,102 @@
+///===- llvm/Frontend/Offloading/PropertySet.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/Offloading/PropertySet.h"
+#include "llvm/Support/Base64.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBufferRef.h"
+
+using namespace llvm;
+using namespace llvm::offloading;
+
+void llvm::offloading::writePropertiesToJSON(
+    const PropertySetRegistry &PSRegistry, raw_ostream &Out) {
+  json::OStream J(Out);
+  J.object([&] {
+    for (const auto &[CategoryName, PropSet] : PSRegistry) {
+      auto PropSetCapture = PropSet;
+      J.attributeObject(CategoryName, [&] {
+        for (const auto &[PropName, PropVal] : PropSetCapture) {
+          switch (PropVal.index()) {
+          case 0:
+            J.attribute(PropName, std::get<uint32_t>(PropVal));
+            break;
+          case 1:
+            J.attribute(PropName, encodeBase64(std::get<ByteArray>(PropVal)));
+            break;
+          default:
+            llvm_unreachable("unsupported property type");
+          }
+        }
+      });
+    }
+  });
+}
+
+// note: createStringError has an overload that takes a format string,
+// but it uses llvm::format instead of llvm::formatv, which does
+// not work with json::Value. This is a helper function to use
+// llvm::formatv with createStringError.
+template <typename... Ts> auto createStringErrorV(Ts &&...Args) {
+  return createStringError(formatv(std::forward<Ts>(Args)...));
+}
+
+Expected<PropertyValue>
+readPropertyValueFromJSON(const json::Value &PropValueVal) {
+  if (std::optional<uint64_t> Val = PropValueVal.getAsUINT64())
+    return PropertyValue(static_cast<uint32_t>(*Val));
+
+  if (std::optional<StringRef> Val = PropValueVal.getAsString()) {
+    std::vector<char> Decoded;
+    if (Error E = decodeBase64(*Val, Decoded))
+      return createStringErrorV("unable to base64 decode the string {0}: {1}",
+                                Val, toString(std::move(E)));
+    return PropertyValue(ByteArray(Decoded.begin(), Decoded.end()));
+  }
+
+  return createStringErrorV("expected a uint64 or a string, got {0}",
+                            PropValueVal);
+}
+
+Expected<PropertySetRegistry>
+llvm::offloading::readPropertiesFromJSON(MemoryBufferRef Buf) {
+  PropertySetRegistry Res;
+  Expected<json::Value> V = json::parse(Buf.getBuffer());
+  if (Error E = V.takeError())
+    return E;
+
+  const json::Object *O = V->getAsObject();
+  if (!O)
+    return createStringErrorV(
+        "error while deserializing property set registry: "
+        "expected JSON object, got {0}",
+        *V);
+
+  for (const auto &[CategoryName, Value] : *O) {
+    const json::Object *PropSetVal = Value.getAsObject();
+    if (!PropSetVal)
+      return createStringErrorV("error while deserializing property set {0}: "
+                                "expected JSON array, got {1}",
+                                CategoryName.str(), Value);
+
+    PropertySet &PropSet = Res[CategoryName.str()];
+    for (const auto &[PropName, PropValueVal] : *PropSetVal) {
+      Expected<PropertyValue> Prop = readPropertyValueFromJSON(PropValueVal);
+      if (Error E = Prop.takeError())
+        return createStringErrorV(
+            "error while deserializing property {0} in property set {1}: {2}",
+            PropName.str(), CategoryName.str(), toString(std::move(E)));
+
+      auto [It, Inserted] =
+          PropSet.try_emplace(PropName.str(), std::move(*Prop));
+      assert(Inserted && "Property already exists in PropertySet");
+      (void)Inserted;
+    }
+  }
+  return Res;
+}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index e5a4e1e..dc6d599 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1163,7 +1163,7 @@ int SlotTracker::processIndex() {
   std::vector<StringRef> ModulePaths;
   for (auto &[ModPath, _] : TheIndex->modulePaths())
     ModulePaths.push_back(ModPath);
-  llvm::sort(ModulePaths.begin(), ModulePaths.end());
+  llvm::sort(ModulePaths);
   for (auto &ModPath : ModulePaths)
     CreateModulePathSlot(ModPath);
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 763cc18..b7cd12a 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -942,14 +942,13 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I,
 
   // We have two instructions of identical opcode and #operands.  Check to see
   // if all operands are the same.
-  if (!std::equal(op_begin(), op_end(), I->op_begin()))
+  if (!equal(operands(), I->operands()))
     return false;
 
   // WARNING: this logic must be kept in sync with EliminateDuplicatePHINodes()!
-  if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) {
-    const PHINode *otherPHI = cast<PHINode>(I);
-    return std::equal(thisPHI->block_begin(), thisPHI->block_end(),
-                      otherPHI->block_begin());
+  if (const PHINode *Phi = dyn_cast<PHINode>(this)) {
+    const PHINode *OtherPhi = cast<PHINode>(I);
+    return equal(Phi->blocks(), OtherPhi->blocks());
   }
 
   return this->hasSameSpecialState(I, /*IgnoreAlignment=*/false,
diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
index 010bd15..ca6a480 100644
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -292,6 +292,9 @@ void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV,
 }
 
 std::optional<std::string> llvm::getArm64ECMangledFunctionName(StringRef Name) {
+  assert(!Name.empty() &&
+         "getArm64ECMangledFunctionName requires non-empty name");
+
   if (Name[0] != '?') {
     // For non-C++ symbols, prefix the name with "#" unless it's already
     // mangled.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 5936ac7..a8e6c79 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -8,6 +8,9 @@
 
 #include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/ADT/StringTable.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "runtime-libcalls-info"
 
 using namespace llvm;
 using namespace RTLIB;
@@ -44,11 +47,9 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
       RTLIB::__aeabi_ui2f,       RTLIB::__aeabi_l2f,
      RTLIB::__aeabi_ul2f,       RTLIB::__aeabi_lmul,
      RTLIB::__aeabi_llsl,       RTLIB::__aeabi_llsr,
-      RTLIB::__aeabi_lasr,       RTLIB::__aeabi_idiv__i8,
-      RTLIB::__aeabi_idiv__i16,  RTLIB::__aeabi_idiv__i32,
+      RTLIB::__aeabi_lasr,       RTLIB::__aeabi_idiv,
      RTLIB::__aeabi_idivmod,    RTLIB::__aeabi_uidivmod,
-      RTLIB::__aeabi_ldivmod,    RTLIB::__aeabi_uidiv__i8,
-      RTLIB::__aeabi_uidiv__i16, RTLIB::__aeabi_uidiv__i32,
+      RTLIB::__aeabi_ldivmod,    RTLIB::__aeabi_uidiv,
      RTLIB::__aeabi_uldivmod,   RTLIB::__aeabi_f2h,
      RTLIB::__aeabi_d2h,        RTLIB::__aeabi_h2f,
      RTLIB::__aeabi_memcpy,     RTLIB::__aeabi_memmove,
@@ -62,12 +63,6 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
     Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS);
 }
 
-void RTLIB::RuntimeLibcallsInfo::initDefaultLibCallImpls() {
-  std::memcpy(LibcallImpls, DefaultLibcallImpls, sizeof(LibcallImpls));
-  static_assert(sizeof(LibcallImpls) == sizeof(DefaultLibcallImpls),
-                "libcall array size should match");
-}
-
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
@@ -76,59 +71,14 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
                                        EABI EABIVersion, StringRef ABIName) {
   setTargetRuntimeLibcallSets(TT, FloatABI);
 
-  // Early exit for targets that have fully ported to tablegen.
-  if (TT.isAMDGPU() || TT.isNVPTX() || TT.isWasm())
-    return;
-
-  if (TT.isX86() || TT.isVE() || TT.isARM() || TT.isThumb()) {
-    if (ExceptionModel == ExceptionHandling::SjLj)
-      setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume);
-  }
-
-  // A few names are different on particular architectures or environments.
-  if (TT.isOSDarwin()) {
-    // For f16/f32 conversions, Darwin uses the standard naming scheme,
-    // instead of the gnueabi-style __gnu_*_ieee.
-    // FIXME: What about other targets?
-    setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__extendhfsf2);
-    setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__truncsfhf2);
-
-    if (!darwinHasExp10(TT)) {
-      setLibcallImpl(RTLIB::EXP10_F32, RTLIB::Unsupported);
-      setLibcallImpl(RTLIB::EXP10_F64, RTLIB::Unsupported);
-    }
-  }
-
-  if (TT.isOSOpenBSD()) {
-    setLibcallImpl(RTLIB::STACKPROTECTOR_CHECK_FAIL, RTLIB::Unsupported);
-    setLibcallImpl(RTLIB::STACK_SMASH_HANDLER, RTLIB::__stack_smash_handler);
-  }
-
-  // Skip default manual processing for targets that have been fully ported to
-  // tablegen for now. Eventually the rest of this should be deleted.
-  if (TT.isX86() || TT.isAArch64() || TT.isWasm())
-    return;
+  if (ExceptionModel == ExceptionHandling::SjLj)
+    setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume);
 
   if (TT.isARM() || TT.isThumb()) {
     setARMLibcallNames(*this, TT, FloatABI, EABIVersion);
     return;
   }
 
-  if (hasSinCos(TT)) {
-    setLibcallImpl(RTLIB::SINCOS_F32, RTLIB::sincosf);
-    setLibcallImpl(RTLIB::SINCOS_F64, RTLIB::sincos);
-    setLibcallImpl(RTLIB::SINCOS_F128, RTLIB::sincos_f128);
-  }
-
-  // These libcalls are only available in compiler-rt, not libgcc.
-  if (TT.isArch64Bit()) {
-    setLibcallImpl(RTLIB::SHL_I128, RTLIB::__ashlti3);
-    setLibcallImpl(RTLIB::SRL_I128, RTLIB::__lshrti3);
-    setLibcallImpl(RTLIB::SRA_I128, RTLIB::__ashrti3);
-    setLibcallImpl(RTLIB::MUL_I128, RTLIB::__multi3);
-    setLibcallImpl(RTLIB::MULO_I64, RTLIB::__mulodi4);
-  }
-
   if (TT.getArch() == Triple::ArchType::msp430) {
     setLibcallImplCallingConv(RTLIB::__mspabi_mpyll,
                               CallingConv::MSP430_BUILTIN);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3ff9895..ca3f148 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6769,10 +6769,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     break;
   }
   case Intrinsic::lifetime_start:
-  case Intrinsic::lifetime_end:
-    Check(isa<AllocaInst>(Call.getArgOperand(1)),
-          "llvm.lifetime.start/end can only be used on alloca", &Call);
+  case Intrinsic::lifetime_end: {
+    Value *Ptr = Call.getArgOperand(1);
+    Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr),
+          "llvm.lifetime.start/end can only be used on alloca or poison",
+          &Call);
     break;
+  }
   };
 
   // Verify that there aren't any unmediated control transfers between funclets.
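
The Verifier.cpp relaxation above means a lifetime marker no longer pins its pointer argument to an alloca: an argument that has been folded to poison is now accepted, which is what the Loads.cpp, MemoryLocation.cpp, StackLifetime.cpp, IRTranslator.cpp, and SelectionDAGBuilder.cpp hunks defend against. A minimal sketch under that assumption (hypothetical helper, not code from the patch) of a transform that can now drop a dead alloca and leave its markers for later cleanup:

    // The lifetime users of AI become lifetime-of-poison, which the updated
    // verifier rule accepts, so they need no special-casing here.
    static void removeDeadAlloca(llvm::AllocaInst *AI) {
      AI->replaceAllUsesWith(llvm::PoisonValue::get(AI->getType()));
      AI->eraseFromParent();
    }
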
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index ae8dffc..8f3814a 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -401,16 +401,15 @@ static bool isIFunc(const MCSymbolELF *Symbol) {
         mergeTypeForSet(Symbol->getType(), ELF::STT_GNU_IFUNC) !=
             ELF::STT_GNU_IFUNC)
       return false;
-    Symbol = &cast<MCSymbolELF>(Value->getSymbol());
+    Symbol = &static_cast<const MCSymbolELF &>(Value->getSymbol());
   }
   return true;
 }
 void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
                             ELFSymbolData &MSD) {
-  const auto &Symbol = cast<MCSymbolELF>(*MSD.Symbol);
-  const MCSymbolELF *Base =
-      cast_or_null<MCSymbolELF>(Asm.getBaseSymbol(Symbol));
+  auto &Symbol = static_cast<const MCSymbolELF &>(*MSD.Symbol);
+  auto *Base = static_cast<const MCSymbolELF *>(Asm.getBaseSymbol(Symbol));
   // This has to be in sync with when computeSymbolTable uses SHN_ABS or
   // SHN_COMMON.
@@ -446,7 +445,7 @@ void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
     const MCSymbolELF *Sym = &Symbol;
     while (Sym->isVariable()) {
       if (auto *Expr = dyn_cast<MCSymbolRefExpr>(Sym->getVariableValue())) {
-        Sym = cast<MCSymbolELF>(&Expr->getSymbol());
+        Sym = static_cast<const MCSymbolELF *>(&Expr->getSymbol());
         if (!Sym->getSize())
           continue;
         ESize = Sym->getSize();
@@ -523,7 +522,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) {
   // Add the data for the symbols.
   bool HasLargeSectionIndex = false;
   for (auto It : llvm::enumerate(Asm.symbols())) {
-    const auto &Symbol = cast<MCSymbolELF>(It.value());
+    auto &Symbol = static_cast<const MCSymbolELF &>(It.value());
     if (!isInSymtab(Symbol))
       continue;
@@ -533,7 +532,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) {
     }
     ELFSymbolData MSD;
-    MSD.Symbol = cast<MCSymbolELF>(&Symbol);
+    MSD.Symbol = static_cast<const MCSymbolELF *>(&Symbol);
     MSD.Order = It.index();
     bool Local = Symbol.getBinding() == ELF::STB_LOCAL;
@@ -1175,7 +1174,7 @@ void ELFObjectWriter::executePostLayoutBinding() {
   // versions declared with @@@ to be renamed.
   for (const Symver &S : Symvers) {
     StringRef AliasName = S.Name;
-    const auto &Symbol = cast<MCSymbolELF>(*S.Sym);
+    auto &Symbol = static_cast<const MCSymbolELF &>(*S.Sym);
     size_t Pos = AliasName.find('@');
     assert(Pos != StringRef::npos);
@@ -1185,8 +1184,8 @@ void ELFObjectWriter::executePostLayoutBinding() {
     if (Rest.starts_with("@@@"))
       Tail = Rest.substr(Symbol.isUndefined() ? 2 : 1);
-    auto *Alias =
-        cast<MCSymbolELF>(Asm->getContext().getOrCreateSymbol(Prefix + Tail));
+    auto *Alias = static_cast<MCSymbolELF *>(
+        Asm->getContext().getOrCreateSymbol(Prefix + Tail));
     Asm->registerSymbol(*Alias);
     const MCExpr *Value = MCSymbolRefExpr::create(&Symbol, Asm->getContext());
     Alias->setVariableValue(Value);
@@ -1218,7 +1217,8 @@ void ELFObjectWriter::executePostLayoutBinding() {
   }
   for (const MCSymbol *&Sym : AddrsigSyms) {
-    if (const MCSymbol *R = Renames.lookup(cast<MCSymbolELF>(Sym)))
+    if (const MCSymbol *R =
+            Renames.lookup(static_cast<const MCSymbolELF *>(Sym)))
       Sym = R;
     if (Sym->isInSection() && Sym->getName().starts_with(".L"))
       Sym = Sym->getSection().getBeginSymbol();
@@ -1234,7 +1234,7 @@ void ELFObjectWriter::executePostLayoutBinding() {
       continue;
     auto *Expr = Alias->getVariableValue();
     if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr)) {
-      auto &Sym = cast<MCSymbolELF>(Inner->getSymbol());
+      auto &Sym = static_cast<const MCSymbolELF &>(Inner->getSymbol());
       if (Asm->registerSymbol(Sym))
        Sym.setBinding(ELF::STB_WEAK);
     }
@@ -1316,7 +1316,7 @@ void ELFObjectWriter::recordRelocation(const MCFragment &F,
   auto &Section = static_cast<const MCSectionELF &>(*F.getParent());
   MCContext &Ctx = getContext();
-  const auto *SymA = cast_or_null<MCSymbolELF>(Target.getAddSym());
+  auto *SymA = static_cast<const MCSymbolELF *>(Target.getAddSym());
   const MCSectionELF *SecA =
       (SymA && SymA->isInSection())
          ? static_cast<const MCSectionELF *>(&SymA->getSection())
@@ -1328,7 +1328,7 @@ void ELFObjectWriter::recordRelocation(const MCFragment &F,
   uint64_t FixupOffset = Asm->getFragmentOffset(F) + Fixup.getOffset();
   uint64_t Addend = Target.getConstant();
   if (auto *RefB = Target.getSubSym()) {
-    const auto &SymB = cast<MCSymbolELF>(*RefB);
+    auto &SymB = static_cast<const MCSymbolELF &>(*RefB);
     if (SymB.isUndefined()) {
       Ctx.reportError(Fixup.getLoc(),
                       Twine("symbol '") + SymB.getName() +
@@ -1363,7 +1363,7 @@ void ELFObjectWriter::recordRelocation(const MCFragment &F,
                        !mc::isRelocRelocation(Fixup.getKind());
   if (UseSectionSym && useSectionSymbol(Target, SymA, Addend, Type)) {
     Addend += Asm->getSymbolOffset(*SymA);
-    SymA = cast<MCSymbolELF>(SecA->getBeginSymbol());
+    SymA = static_cast<const MCSymbolELF *>(SecA->getBeginSymbol());
   } else if (const MCSymbolELF *R = Renames.lookup(SymA)) {
     SymA = R;
   }
@@ -1383,7 +1383,7 @@ bool ELFObjectWriter::usesRela(const MCTargetOptions *TO,
 bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
     const MCSymbol &SA, const MCFragment &FB, bool InSet, bool IsPCRel) const {
-  const auto &SymA = cast<MCSymbolELF>(SA);
+  auto &SymA = static_cast<const MCSymbolELF &>(SA);
   if (IsPCRel) {
     assert(!InSet);
     if (SymA.getBinding() != ELF::STB_LOCAL ||
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp
index 88188f3..3b629cd 100644
--- a/llvm/lib/MC/GOFFObjectWriter.cpp
+++ b/llvm/lib/MC/GOFFObjectWriter.cpp
@@ -345,7 +345,7 @@ void GOFFWriter::defineSymbols() {
   for (const MCSymbol &Sym : Asm.symbols()) {
     if (Sym.isTemporary())
       continue;
-    auto &Symbol = cast<MCSymbolGOFF>(Sym);
+    auto &Symbol = static_cast<const MCSymbolGOFF &>(Sym);
     if (Symbol.hasLDAttributes()) {
       Symbol.setIndex(++Ordinal);
       defineLabel(Symbol);
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index 828d9cf..55ec4a6 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -8,6 +8,7 @@
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDXContainerWriter.h"
"llvm/MC/MCDXContainerWriter.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCGOFFObjectWriter.h" @@ -122,14 +123,12 @@ void MCAsmBackend::maybeAddReloc(const MCFragment &F, const MCFixup &Fixup, } bool MCAsmBackend::isDarwinCanonicalPersonality(const MCSymbol *Sym) const { + assert(getContext().isMachO()); // Consider a NULL personality (ie., no personality encoding) to be canonical // because it's always at 0. if (!Sym) return true; - if (!Sym->isMachO()) - llvm_unreachable("Expected MachO symbols only"); - StringRef name = Sym->getName(); // XXX: We intentionally leave out "___gcc_personality_v0" because, despite // being system-defined like these two, it is not very commonly-used. diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index da51da4..93614cd 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -897,14 +897,14 @@ void MCAsmStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, // Print symbol's rename (original name contains invalid character(s)) if // there is one. - MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(CsectSym); + auto *XSym = static_cast<MCSymbolXCOFF *>(CsectSym); if (XSym->hasRename()) emitXCOFFRenameDirective(XSym, XSym->getSymbolTableName()); } void MCAsmStreamer::emitXCOFFSymbolLinkageWithVisibility( MCSymbol *Symbol, MCSymbolAttr Linkage, MCSymbolAttr Visibility) { - + auto &Sym = static_cast<MCSymbolXCOFF &>(*Symbol); switch (Linkage) { case MCSA_Global: OS << MAI->getGlobalDirective(); @@ -944,9 +944,8 @@ void MCAsmStreamer::emitXCOFFSymbolLinkageWithVisibility( // Print symbol's rename (original name contains invalid character(s)) if // there is one. - if (cast<MCSymbolXCOFF>(Symbol)->hasRename()) - emitXCOFFRenameDirective(Symbol, - cast<MCSymbolXCOFF>(Symbol)->getSymbolTableName()); + if (Sym.hasRename()) + emitXCOFFRenameDirective(&Sym, Sym.getSymbolTableName()); } void MCAsmStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, @@ -1070,9 +1069,11 @@ void MCAsmStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, // Print symbol's rename (original name contains invalid character(s)) if // there is one. 
-  MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(Symbol);
-  if (XSym && XSym->hasRename())
-    emitXCOFFRenameDirective(XSym, XSym->getSymbolTableName());
+  if (getContext().isXCOFF()) {
+    auto *XSym = static_cast<MCSymbolXCOFF *>(Symbol);
+    if (XSym && XSym->hasRename())
+      emitXCOFFRenameDirective(XSym, XSym->getSymbolTableName());
+  }
 }
 void MCAsmStreamer::emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 8500fd1..d172ad1 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -59,7 +59,8 @@ STATISTIC(EmittedFillFragments,
           "Number of emitted assembler fragments - fill");
 STATISTIC(EmittedNopsFragments, "Number of emitted assembler fragments - nops");
 STATISTIC(EmittedOrgFragments, "Number of emitted assembler fragments - org");
-STATISTIC(evaluateFixup, "Number of evaluated fixups");
+STATISTIC(Fixups, "Number of fixups");
+STATISTIC(FixupEvalForRelax, "Number of fixup evaluations for relaxation");
 STATISTIC(ObjectBytes, "Number of emitted object file bytes");
 STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
 STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
@@ -140,9 +141,9 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
 bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup,
                                 MCValue &Target, uint64_t &Value,
-                                bool RecordReloc,
-                                MutableArrayRef<char> Contents) const {
-  ++stats::evaluateFixup;
+                                bool RecordReloc, uint8_t *Data) const {
+  if (RecordReloc)
+    ++stats::Fixups;
   // FIXME: This code has some duplication with recordRelocation. We should
   // probably merge the two into a single callback that tries to evaluate a
@@ -185,7 +186,7 @@ bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup,
   if (IsResolved && mc::isRelocRelocation(Fixup.getKind()))
     IsResolved = false;
-  getBackend().applyFixup(F, Fixup, Target, Contents, Value, IsResolved);
+  getBackend().applyFixup(F, Fixup, Target, Data, Value, IsResolved);
   return true;
 }
@@ -703,21 +704,25 @@ void MCAssembler::layout() {
       for (MCFixup &Fixup : F.getFixups()) {
         uint64_t FixedValue;
         MCValue Target;
+        assert(mc::isRelocRelocation(Fixup.getKind()) ||
+               Fixup.getOffset() <= F.getFixedSize());
+        auto *Data =
+            reinterpret_cast<uint8_t *>(Contents.data() + Fixup.getOffset());
         evaluateFixup(F, Fixup, Target, FixedValue,
-                      /*RecordReloc=*/true, Contents);
+                      /*RecordReloc=*/true, Data);
       }
-      if (F.getVarFixups().size()) {
-        // In the variable part, fixup offsets are relative to the fixed part's
-        // start. Extend the variable contents to the left to account for the
-        // fixed part size.
-        Contents = MutableArrayRef(F.getParent()->ContentStorage)
-                       .slice(F.VarContentStart - Contents.size(), F.getSize());
-        for (MCFixup &Fixup : F.getVarFixups()) {
-          uint64_t FixedValue;
-          MCValue Target;
-          evaluateFixup(F, Fixup, Target, FixedValue,
-                        /*RecordReloc=*/true, Contents);
-        }
-      }
+      // In the variable part, fixup offsets are relative to the fixed part's
+      // start.
+      for (MCFixup &Fixup : F.getVarFixups()) {
+        uint64_t FixedValue;
+        MCValue Target;
+        assert(mc::isRelocRelocation(Fixup.getKind()) ||
+               (Fixup.getOffset() >= F.getFixedSize() &&
+                Fixup.getOffset() <= F.getSize()));
+        auto *Data = reinterpret_cast<uint8_t *>(
+            F.getVarContents().data() + (Fixup.getOffset() - F.getFixedSize()));
+        evaluateFixup(F, Fixup, Target, FixedValue,
+                      /*RecordReloc=*/true, Data);
       }
     }
   }
@@ -735,7 +740,7 @@ void MCAssembler::Finish() {
 bool MCAssembler::fixupNeedsRelaxation(const MCFragment &F,
                                        const MCFixup &Fixup) const {
-  assert(getBackendPtr() && "Expected assembler backend");
+  ++stats::FixupEvalForRelax;
   MCValue Target;
   uint64_t Value;
   bool Resolved = evaluateFixup(F, const_cast<MCFixup &>(Fixup), Target, Value,
@@ -940,6 +945,14 @@ bool MCAssembler::relaxFill(MCFillFragment &F) {
   return true;
 }
+bool MCAssembler::relaxOrg(MCOrgFragment &F) {
+  uint64_t Size = computeFragmentSize(F);
+  if (F.getSize() == Size)
+    return false;
+  F.setSize(Size);
+  return true;
+}
+
 bool MCAssembler::relaxFragment(MCFragment &F) {
   switch(F.getKind()) {
   default:
@@ -961,6 +974,8 @@ bool MCAssembler::relaxFragment(MCFragment &F) {
     return relaxCVDefRange(cast<MCCVDefRangeFragment>(F));
   case MCFragment::FT_Fill:
     return relaxFill(cast<MCFillFragment>(F));
+  case MCFragment::FT_Org:
+    return relaxOrg(static_cast<MCOrgFragment &>(F));
   }
 }
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 7d528a5..335934a7 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -436,12 +436,11 @@ void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS,
                                                      const MCSymbol *FnEndSym) {
   // Create and insert a fragment into the current section that will be encoded
   // later.
-  auto *F = MCCtx->allocFragment<MCCVInlineLineTableFragment>(
+  OS.newSpecialFragment<MCCVInlineLineTableFragment>(
       PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym);
-  OS.insert(F);
 }
-MCFragment *CodeViewContext::emitDefRange(
+void CodeViewContext::emitDefRange(
     MCObjectStreamer &OS,
     ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
     StringRef FixedSizePortion) {
@@ -451,9 +450,7 @@ MCFragment *CodeViewContext::emitDefRange(
   auto &Saved = DefRangeStorage.emplace_back(Ranges.begin(), Ranges.end());
   // Create and insert a fragment into the current section that will be encoded
   // later.
-  auto *F = MCCtx->allocFragment<MCCVDefRangeFragment>(Saved, FixedSizePortion);
-  OS.insert(F);
-  return F;
+  OS.newSpecialFragment<MCCVDefRangeFragment>(Saved, FixedSizePortion);
 }
 static unsigned computeLabelDiff(const MCAssembler &Asm, const MCSymbol *Begin,
@@ -695,5 +692,7 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
   }
   Frag.setVarContents(Contents);
+  assert(Fixups.size() < 256 && "Store fixups outside of MCFragment's VarFixup "
+                                "storage if the number ever exceeds 256");
   Frag.setVarFixups(Fixups);
 }
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 39bf628..5e364e9 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -153,15 +153,12 @@ void MCContext::reset() {
   SPIRVAllocator.DestroyAll();
   WasmSignatureAllocator.DestroyAll();
-  // ~CodeViewContext may destroy a MCFragment outside of sections and need to
-  // be reset before FragmentAllocator.
   CVContext.reset();
   MCSubtargetAllocator.DestroyAll();
   InlineAsmUsedLabelNames.clear();
   Symbols.clear();
   Allocator.Reset();
-  FragmentAllocator.Reset();
   Instances.clear();
   CompilationDir.clear();
   MainFileName.clear();
@@ -297,11 +294,9 @@ MCSymbol *MCContext::createSymbolImpl(const MCSymbolTableEntry *Name,
   case MCContext::IsDXContainer:
     break;
   case MCContext::IsSPIRV:
-    return new (Name, *this)
-        MCSymbol(MCSymbol::SymbolKindUnset, Name, IsTemporary);
+    return new (Name, *this) MCSymbol(Name, IsTemporary);
   }
-  return new (Name, *this)
-      MCSymbol(MCSymbol::SymbolKindUnset, Name, IsTemporary);
+  return new (Name, *this) MCSymbol(Name, IsTemporary);
 }
 MCSymbol *MCContext::cloneSymbol(MCSymbol &Sym) {
@@ -309,13 +304,16 @@ MCSymbol *MCContext::cloneSymbol(MCSymbol &Sym) {
   auto Name = Sym.getNameEntryPtr();
   switch (getObjectFileType()) {
   case MCContext::IsCOFF:
-    NewSym = new (Name, *this) MCSymbolCOFF(cast<MCSymbolCOFF>(Sym));
+    NewSym =
+        new (Name, *this) MCSymbolCOFF(static_cast<const MCSymbolCOFF &>(Sym));
     break;
   case MCContext::IsELF:
-    NewSym = new (Name, *this) MCSymbolELF(cast<MCSymbolELF>(Sym));
+    NewSym =
+        new (Name, *this) MCSymbolELF(static_cast<const MCSymbolELF &>(Sym));
     break;
   case MCContext::IsMachO:
-    NewSym = new (Name, *this) MCSymbolMachO(cast<MCSymbolMachO>(Sym));
+    NewSym = new (Name, *this)
+        MCSymbolMachO(static_cast<const MCSymbolMachO &>(Sym));
     break;
   default:
     reportFatalUsageError(".set redefinition is not supported");
@@ -446,7 +444,7 @@ Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) {
   // Use the symbol's index to track if it has been used as a section symbol.
   // Set to -1 to catch potential bugs if misused as a symbol index.
   if (Sym && Sym->getIndex() != -1u) {
-    R = cast<Symbol>(Sym);
+    R = static_cast<Symbol *>(Sym);
   } else {
     SymEntry.second.Used = true;
     R = new (&SymEntry, *this) Symbol(&SymEntry, /*isTemporary=*/false);
@@ -586,7 +584,7 @@ MCContext::createELFRelSection(const Twine &Name, unsigned Type, unsigned Flags,
   return createELFSectionImpl(
       I->getKey(), Type, Flags, EntrySize, Group, true, true,
-      cast<MCSymbolELF>(RelInfoSection->getBeginSymbol()));
+      static_cast<const MCSymbolELF *>(RelInfoSection->getBeginSymbol()));
 }
 MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
@@ -604,7 +602,7 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
                                        const MCSymbolELF *LinkedToSym) {
   MCSymbolELF *GroupSym = nullptr;
   if (!Group.isTriviallyEmpty() && !Group.str().empty())
-    GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
+    GroupSym = static_cast<MCSymbolELF *>(getOrCreateSymbol(Group));
   return getELFSection(Section, Type, Flags, EntrySize, GroupSym, IsComdat,
                        UniqueID, LinkedToSym);
@@ -817,7 +815,7 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind K,
                                          unsigned UniqueID) {
   MCSymbolWasm *GroupSym = nullptr;
   if (!Group.isTriviallyEmpty() && !Group.str().empty()) {
-    GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
+    GroupSym = static_cast<MCSymbolWasm *>(getOrCreateSymbol(Group));
     GroupSym->setComdat(true);
     if (K.isMetadata() && !GroupSym->getType().has_value()) {
       // Comdat group symbol associated with a custom section is a section
@@ -848,7 +846,7 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
   MCSymbol *Begin = createRenamableSymbol(CachedName, true, false);
   // Begin always has a different name than CachedName... see #48596.
   getSymbolTableEntry(Begin->getName()).second.Symbol = Begin;
-  cast<MCSymbolWasm>(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION);
+  static_cast<MCSymbolWasm *>(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION);
   MCSectionWasm *Result = new (WasmAllocator.Allocate())
       MCSectionWasm(CachedName, Kind, Flags, GroupSym, UniqueID, Begin);
@@ -889,9 +887,9 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
   MCSymbolXCOFF *QualName = nullptr;
   // Debug section don't have storage class attribute.
   if (IsDwarfSec)
-    QualName = cast<MCSymbolXCOFF>(getOrCreateSymbol(CachedName));
+    QualName = static_cast<MCSymbolXCOFF *>(getOrCreateSymbol(CachedName));
   else
-    QualName = cast<MCSymbolXCOFF>(getOrCreateSymbol(
+    QualName = static_cast<MCSymbolXCOFF *>(getOrCreateSymbol(
         CachedName + "[" +
         XCOFF::getMappingClassString(CsectProp->MappingClass) + "]"));
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 38744a0..275e76e 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -59,7 +59,7 @@ void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) {
 }
 void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   MCObjectStreamer::emitLabel(Symbol, Loc);
   const MCSectionELF &Section =
@@ -70,7 +70,7 @@ void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
 void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F,
                                    uint64_t Offset) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset);
   const MCSectionELF &Section =
@@ -95,7 +95,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 }
 void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) {
-  auto *A = cast<MCSymbolELF>(Alias);
+  auto *A = static_cast<MCSymbolELF *>(Alias);
   if (A->isDefined()) {
     getContext().reportError(getStartTokLoc(), "symbol '" + A->getName() +
                                                    "' is already defined");
@@ -126,7 +126,7 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
 }
 bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   // Adding a symbol attribute always introduces the symbol, note that an
   // important side effect of calling registerSymbol here is to register
@@ -247,7 +247,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
 void MCELFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
                                      Align ByteAlignment) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   getAssembler().registerSymbol(*Symbol);
   if (!Symbol->isBindingSet())
@@ -272,12 +272,12 @@ void MCELFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
                                    " redeclared as different type");
   }
-  cast<MCSymbolELF>(Symbol)
-      ->setSize(MCConstantExpr::create(Size, getContext()));
+  static_cast<MCSymbolELF *>(Symbol)->setSize(
+      MCConstantExpr::create(Size, getContext()));
 }
 void MCELFStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
-  cast<MCSymbolELF>(Symbol)->setSize(Value);
+  static_cast<MCSymbolELF *>(Symbol)->setSize(Value);
 }
 void MCELFStreamer::emitELFSymverDirective(const MCSymbol *OriginalSym,
@@ -289,7 +289,7 @@ void MCELFStreamer::emitELFSymverDirective(const MCSymbol *OriginalSym,
 void MCELFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
                                           Align ByteAlignment) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   // FIXME: Should this be caught and done earlier?
   getAssembler().registerSymbol(*Symbol);
   Symbol->setBinding(ELF::STB_LOCAL);
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index a214513..6226b02 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -147,7 +147,7 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 void MCMachOStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
                                           MCSymbol *EHSymbol) {
-  auto *Sym = cast<MCSymbolMachO>(Symbol);
+  auto *Sym = static_cast<const MCSymbolMachO *>(Symbol);
   getAssembler().registerSymbol(*Symbol);
   if (Symbol->isExternal())
     emitSymbolAttribute(EHSymbol, MCSA_Global);
@@ -160,7 +160,7 @@ void MCMachOStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
 void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
   // We have to create a new fragment if this is an atom defining symbol,
   // fragments cannot span atoms.
-  if (cast<MCSymbolMachO>(Symbol)->isSymbolLinkerVisible())
+  if (static_cast<MCSymbolMachO *>(Symbol)->isSymbolLinkerVisible())
     newFragment();
   MCObjectStreamer::emitLabel(Symbol, Loc);
@@ -172,7 +172,7 @@ void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
   //
   // FIXME: Cleanup this code, these bits should be emitted based on semantic
   // properties, not on the order of definition, etc.
-  cast<MCSymbolMachO>(Symbol)->clearReferenceType();
+  static_cast<MCSymbolMachO *>(Symbol)->clearReferenceType();
 }
 void MCMachOStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
@@ -182,7 +182,7 @@ void MCMachOStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
     if (const auto *SymA = Res.getAddSym()) {
       if (!Res.getSubSym() &&
           (SymA->getName().empty() || Res.getConstant() != 0))
-        cast<MCSymbolMachO>(Symbol)->setAltEntry();
+        static_cast<MCSymbolMachO *>(Symbol)->setAltEntry();
     }
   }
   MCObjectStreamer::emitAssignment(Symbol, Value);
@@ -256,7 +256,7 @@ void MCMachOStreamer::emitDarwinTargetVariantBuildVersion(
 bool MCMachOStreamer::emitSymbolAttribute(MCSymbol *Sym,
                                           MCSymbolAttr Attribute) {
-  MCSymbolMachO *Symbol = cast<MCSymbolMachO>(Sym);
+  auto *Symbol = static_cast<MCSymbolMachO *>(Sym);
   // Indirect symbols are handled differently, to match how 'as' handles
   // them. This makes writing matching .o files easier.
@@ -367,7 +367,7 @@ bool MCMachOStreamer::emitSymbolAttribute(MCSymbol *Sym,
 void MCMachOStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
   // Encode the 'desc' value into the lowest implementation defined bits.
   getAssembler().registerSymbol(*Symbol);
-  cast<MCSymbolMachO>(Symbol)->setDesc(DescValue);
+  static_cast<MCSymbolMachO *>(Symbol)->setDesc(DescValue);
 }
 void MCMachOStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -430,7 +430,7 @@ void MCMachOStreamer::finishImpl() {
   // defining symbols.
   DenseMap<const MCFragment *, const MCSymbol *> DefiningSymbolMap;
   for (const MCSymbol &Symbol : getAssembler().symbols()) {
-    auto &Sym = cast<MCSymbolMachO>(Symbol);
+    auto &Sym = static_cast<const MCSymbolMachO &>(Symbol);
     if (Sym.isSymbolLinkerVisible() && Sym.isInSection() && !Sym.isVariable() &&
         !Sym.isAltEntry()) {
       // An atom defining symbol should never be internal to a fragment.
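[Editor's note: a word on the pattern running through these MC changes. Each streamer and writer only ever creates the MCSymbol subclass matching its object-file format, so the checked cast<> (which asserts the symbol's kind on every call) is downgraded to a plain static_cast. A schematic illustration with simplified stand-in types, not the real MC hierarchy:]

#include <cassert>

struct Symbol { bool IsELF = false; };         // stand-in for MCSymbol
struct SymbolELF : Symbol {                    // stand-in for MCSymbolELF
  SymbolELF() { IsELF = true; }
  unsigned Binding = 0;
};

// An ELF-only context hands out nothing but SymbolELF, so the downcast below
// cannot be wrong; static_cast skips the per-call kind check that a checked
// cast<> would perform in every caller.
void setLocalBinding(Symbol *S) {
  assert(S->IsELF && "guaranteed by construction in an ELF-only context");
  static_cast<SymbolELF *>(S)->Binding = 1; // e.g. ELF::STB_LOCAL
}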
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 393eed1..4ac73ab 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -1135,9 +1135,10 @@ MCObjectFileInfo::getCallGraphSection(const MCSection &TextSec) const {
     Flags |= ELF::SHF_GROUP;
   }
-  return Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, Flags, 0,
-                            GroupName, true, ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec.getBeginSymbol()));
+  return Ctx->getELFSection(
+      ".callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName, true,
+      ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
 }
 MCSection *
@@ -1154,9 +1155,10 @@ MCObjectFileInfo::getStackSizesSection(const MCSection &TextSec) const {
     Flags |= ELF::SHF_GROUP;
   }
-  return Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, Flags, 0,
-                            GroupName, true, ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec.getBeginSymbol()));
+  return Ctx->getELFSection(
+      ".stack_sizes", ELF::SHT_PROGBITS, Flags, 0, GroupName, true,
+      ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
 }
 MCSection *
@@ -1174,9 +1176,10 @@ MCObjectFileInfo::getBBAddrMapSection(const MCSection &TextSec) const {
   // Use the text section's begin symbol and unique ID to create a separate
   // .llvm_bb_addr_map section associated with every unique text section.
-  return Ctx->getELFSection(".llvm_bb_addr_map", ELF::SHT_LLVM_BB_ADDR_MAP,
-                            Flags, 0, GroupName, true, ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec.getBeginSymbol()));
+  return Ctx->getELFSection(
+      ".llvm_bb_addr_map", ELF::SHT_LLVM_BB_ADDR_MAP, Flags, 0, GroupName, true,
+      ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
 }
 MCSection *
@@ -1192,10 +1195,10 @@ MCObjectFileInfo::getKCFITrapSection(const MCSection &TextSec) const {
     Flags |= ELF::SHF_GROUP;
   }
-  return Ctx->getELFSection(".kcfi_traps", ELF::SHT_PROGBITS, Flags, 0,
-                            GroupName,
-                            /*IsComdat=*/true, ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec.getBeginSymbol()));
+  return Ctx->getELFSection(
+      ".kcfi_traps", ELF::SHT_PROGBITS, Flags, 0, GroupName,
+      /*IsComdat=*/true, ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
 }
 MCSection *
@@ -1211,9 +1214,10 @@ MCObjectFileInfo::getPseudoProbeSection(const MCSection &TextSec) const {
     Flags |= ELF::SHF_GROUP;
   }
-  return Ctx->getELFSection(PseudoProbeSection->getName(), ELF::SHT_PROGBITS,
-                            Flags, 0, GroupName, true, ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec.getBeginSymbol()));
+  return Ctx->getELFSection(
+      PseudoProbeSection->getName(), ELF::SHT_PROGBITS, Flags, 0, GroupName,
+      true, ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
 }
 MCSection *
@@ -1261,7 +1265,7 @@ MCSection *MCObjectFileInfo::getPCSection(StringRef Name,
     GroupName = Group->getName();
     Flags |= ELF::SHF_GROUP;
   }
-  return Ctx->getELFSection(Name, ELF::SHT_PROGBITS, Flags, 0, GroupName, true,
-                            ElfSec.getUniqueID(),
-                            cast<MCSymbolELF>(TextSec->getBeginSymbol()));
+  return Ctx->getELFSection(
+      Name, ELF::SHT_PROGBITS, Flags, 0, GroupName, true, ElfSec.getUniqueID(),
+      static_cast<const MCSymbolELF *>(TextSec->getBeginSymbol()));
 }
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index e277143..8c27958 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -46,27 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
   return nullptr;
 }
+constexpr size_t FragBlockSize = 16384;
+// Ensure the new fragment can at least store a few bytes.
+constexpr size_t NewFragHeadroom = 8;
+
+static_assert(NewFragHeadroom >= alignof(MCFragment));
+static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom);
+
+MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) {
+  auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom);
+  FragSpace = Size - sizeof(MCFragment);
+  auto Block = std::unique_ptr<uint8_t[]>(new uint8_t[Size]);
+  auto *F = reinterpret_cast<MCFragment *>(Block.get());
+  FragStorage.push_back(std::move(Block));
+  return F;
+}
+
 void MCObjectStreamer::newFragment() {
-  addFragment(getContext().allocFragment<MCFragment>());
+  MCFragment *F;
+  if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+    auto End = reinterpret_cast<size_t>(getCurFragEnd());
+    F = reinterpret_cast<MCFragment *>(
+        alignToPowerOf2(End, alignof(MCFragment)));
+    FragSpace -= size_t(F) - End + sizeof(MCFragment);
+  } else {
+    F = allocFragSpace(0);
+  }
+  new (F) MCFragment();
+  addFragment(F);
 }
-void MCObjectStreamer::insert(MCFragment *F) {
-  assert(F->getKind() != MCFragment::FT_Data &&
-         "F should have a variable-size tail");
+void MCObjectStreamer::ensureHeadroom(size_t Headroom) {
+  if (Headroom <= FragSpace)
+    return;
+  auto *F = allocFragSpace(Headroom);
+  new (F) MCFragment();
+  addFragment(F);
+}
+
+void MCObjectStreamer::addSpecialFragment(MCFragment *Frag) {
+  assert(Frag->getKind() != MCFragment::FT_Data &&
+         "Frag should have a variable-size tail");
+  // Frag is not connected to FragSpace. Before modifying CurFrag with
+  // addFragment(Frag), allocate an empty fragment to maintain FragSpace
+  // connectivity, potentially reusing CurFrag's associated space.
+  MCFragment *F;
+  if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+    auto End = reinterpret_cast<size_t>(getCurFragEnd());
+    F = reinterpret_cast<MCFragment *>(
+        alignToPowerOf2(End, alignof(MCFragment)));
+    FragSpace -= size_t(F) - End + sizeof(MCFragment);
+  } else {
+    F = allocFragSpace(0);
+  }
+  new (F) MCFragment();
+
+  addFragment(Frag);
   addFragment(F);
-  newFragment();
 }
 void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
-  CurFrag->appendContents(Contents);
+  ensureHeadroom(Contents.size());
+  assert(FragSpace >= Contents.size());
+  llvm::copy(Contents, getCurFragEnd());
+  CurFrag->FixedSize += Contents.size();
+  FragSpace -= Contents.size();
 }
-void MCObjectStreamer::appendContents(size_t Num, char Elt) {
-  CurFrag->appendContents(Num, Elt);
+void MCObjectStreamer::appendContents(size_t Num, uint8_t Elt) {
+  ensureHeadroom(Num);
+  MutableArrayRef<uint8_t> Data(getCurFragEnd(), Num);
+  llvm::fill(Data, Elt);
+  CurFrag->FixedSize += Num;
+  FragSpace -= Num;
 }
 void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
-  CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+  CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind));
 }
 // As a compile-time optimization, avoid allocating and evaluating an MCExpr
@@ -115,6 +171,9 @@ void MCObjectStreamer::reset() {
   }
   EmitEHFrame = true;
   EmitDebugFrame = false;
+  FragStorage.clear();
+  FragSpace = 0;
+  SpecialFragAllocator.Reset();
   MCStreamer::reset();
 }
@@ -143,7 +202,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
 void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
                                      SMLoc Loc) {
   MCStreamer::emitValueImpl(Value, Size, Loc);
-  MCFragment *DF = getCurrentFragment();
   MCDwarfLineEntry::make(this, getCurrentSectionOnly());
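[Editor's note: the allocation scheme introduced above replaces per-fragment heap allocation. Fragments are carved out of large blocks, and the current fragment's fixed-size contents grow in place into the remaining headroom. A compressed sketch of the same idea, independent of the real MCFragment layout; Frag, Stream, and BlockSize are stand-ins:]

#include <algorithm>
#include <cstdint>
#include <memory>
#include <new>
#include <vector>

struct Frag { size_t FixedSize = 0; }; // header; its contents follow in-block

struct Stream {
  static constexpr size_t BlockSize = 16384;
  std::vector<std::unique_ptr<uint8_t[]>> Blocks;
  Frag *Cur = nullptr;
  size_t Space = 0; // headroom left after Cur's current contents

  uint8_t *curEnd() {
    return reinterpret_cast<uint8_t *>(Cur + 1) + Cur->FixedSize;
  }
  void newBlock(size_t Headroom) {
    size_t Size = std::max(BlockSize, sizeof(Frag) + Headroom);
    Blocks.push_back(std::make_unique<uint8_t[]>(Size));
    Cur = new (Blocks.back().get()) Frag(); // placement-new the header
    Space = Size - sizeof(Frag);
  }
  void append(const uint8_t *Data, size_t N) {
    if (N > Space) // ensureHeadroom: keep one run of bytes in one fragment
      newBlock(N);
    std::copy(Data, Data + N, curEnd());
    Cur->FixedSize += N;
    Space -= N;
  }
};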
@@ -158,9 +216,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
     emitIntValue(AbsValue, Size);
     return;
   }
-  DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
-                               MCFixup::getDataKindForSize(Size)));
-  DF->appendContents(Size, 0);
+  ensureHeadroom(Size);
+  addFixup(Value, MCFixup::getDataKindForSize(Size));
+  appendContents(Size, 0);
 }
 MCSymbol *MCObjectStreamer::emitCFILabel() {
@@ -194,7 +252,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
   // section.
   MCFragment *F = CurFrag;
   Symbol->setFragment(F);
-  Symbol->setOffset(F->getContents().size());
+  Symbol->setOffset(F->getFixedSize());
   emitPendingAssignments(Symbol);
 }
@@ -260,6 +318,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
     F0 = CurFrag;
   }
+  // To maintain connectivity between CurFrag and FragSpace when CurFrag is
+  // modified, allocate an empty fragment and append it to the fragment list.
+  // (Subsections[I].second.Tail is not connected to FragSpace.)
+  MCFragment *F;
+  if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+    auto End = reinterpret_cast<size_t>(getCurFragEnd());
+    F = reinterpret_cast<MCFragment *>(
+        alignToPowerOf2(End, alignof(MCFragment)));
+    FragSpace -= size_t(F) - End + sizeof(MCFragment);
+  } else {
+    F = allocFragSpace(0);
+  }
+  new (F) MCFragment();
+  F->setParent(Section);
+
   auto &Subsections = Section->Subsections;
   size_t I = 0, E = Subsections.size();
   while (I != E && Subsections[I].first < Subsection)
@@ -267,13 +340,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
   // If the subsection number is not in the sorted Subsections list, create a
   // new fragment list.
   if (I == E || Subsections[I].first != Subsection) {
-    auto *F = getContext().allocFragment<MCFragment>();
-    F->setParent(Section);
     Subsections.insert(Subsections.begin() + I,
                        {Subsection, MCSection::FragList{F, F}});
+    Section->CurFragList = &Subsections[I].second;
+    CurFrag = F;
+  } else {
+    Section->CurFragList = &Subsections[I].second;
+    CurFrag = Subsections[I].second.Tail;
+    // Ensure CurFrag is associated with FragSpace.
+    addFragment(F);
   }
-  Section->CurFragList = &Subsections[I].second;
-  CurFrag = Section->CurFragList->Tail;
   // Define the section symbol at subsection 0's initial fragment if required.
   if (!NewSec)
@@ -344,11 +420,15 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
   MCFragment *F = getCurrentFragment();
   // Append the instruction to the data fragment.
-  size_t CodeOffset = F->getContents().size();
+  size_t CodeOffset = getCurFragSize();
+  SmallString<16> Content;
   SmallVector<MCFixup, 1> Fixups;
-  getAssembler().getEmitter().encodeInstruction(
-      Inst, F->getContentsForAppending(), Fixups, STI);
-  F->doneAppending();
+  getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI);
+  appendContents(Content);
+  if (CurFrag != F) {
+    F = CurFrag;
+    CodeOffset = 0;
+  }
   F->setHasInstructions(STI);
   if (Fixups.empty())
@@ -570,7 +650,7 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment,
 void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
                                          unsigned char Value,
                                          SMLoc Loc) {
-  insert(getContext().allocFragment<MCOrgFragment>(*Offset, Value, Loc));
+  newSpecialFragment<MCOrgFragment>(*Offset, Value, Loc);
 }
 void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
@@ -602,8 +682,7 @@ void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
 void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
                                 SMLoc Loc) {
   assert(getCurrentSectionOnly() && "need a section");
-  insert(
-      getContext().allocFragment<MCFillFragment>(FillValue, 1, NumBytes, Loc));
+  newSpecialFragment<MCFillFragment>(FillValue, 1, NumBytes, Loc);
 }
 void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
@@ -630,15 +709,13 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
   // Otherwise emit as fragment.
   assert(getCurrentSectionOnly() && "need a section");
-  insert(
-      getContext().allocFragment<MCFillFragment>(Expr, Size, NumValues, Loc));
+  newSpecialFragment<MCFillFragment>(Expr, Size, NumValues, Loc);
 }
 void MCObjectStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLength,
                                 SMLoc Loc, const MCSubtargetInfo &STI) {
   assert(getCurrentSectionOnly() && "need a section");
-  insert(getContext().allocFragment<MCNopsFragment>(
-      NumBytes, ControlledNopLength, Loc, STI));
+  newSpecialFragment<MCNopsFragment>(NumBytes, ControlledNopLength, Loc, STI);
 }
 void MCObjectStreamer::emitFileDirective(StringRef Filename) {
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 9f64a98..7782dc1 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -1865,7 +1865,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     }
     if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc &&
-        Sym->isExternal() && !cast<MCSymbolMachO>(Sym)->isAltEntry())
+        Sym->isExternal() && !static_cast<MCSymbolMachO *>(Sym)->isAltEntry())
       return Error(StartTokLoc, "non-private labels cannot appear between "
                                 ".cfi_startproc / .cfi_endproc pairs") &&
              Error(*CFIStartProcLoc, "previous .cfi_startproc was here");
@@ -6273,7 +6273,8 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
   // used as a symbol, or it is an absolute symbol).
   Sym = Parser.getContext().lookupSymbol(Name);
   if (Sym) {
-    if (!Sym->isUnset() && (!allow_redef || !Sym->isRedefinable()))
+    if ((Sym->isVariable() || Sym->isDefined()) &&
+        (!allow_redef || !Sym->isRedefinable()))
       return Parser.Error(EqualLoc, "redefinition of '" + Name + "'");
     // If the symbol is redefinable, clone it and update the symbol table
     // to the new symbol. Existing references to the original symbol remain
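[Editor's note: the predicate change just above replaces the old !Sym->isUnset() test. Spelled out, a name that has only been referenced so far can always take its first assignment, while a name that already has a value or a definition needs the redefinition escape hatch. A simplified restatement with stand-in fields for the MCSymbol queries:]

struct Sym {
  bool IsVariable = false;    // already has a .set/.equ value
  bool IsDefined = false;     // already defined, e.g. as a label
  bool IsRedefinable = false; // redefinition explicitly permitted
};

static bool mayAssign(const Sym &S, bool AllowRedef) {
  if (S.IsVariable || S.IsDefined) // previously: !isUnset()
    return AllowRedef && S.IsRedefinable;
  return true; // merely referenced so far: the first assignment is fine
}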
diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
index 282f22f..229b0b8 100644
--- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
@@ -460,7 +460,8 @@ bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) {
       nextLoc = getTok().getLoc();
     }
   }
-  MCSymbolCOFF *Sym = cast<MCSymbolCOFF>(getContext().getOrCreateSymbol(Label));
+  auto *Sym =
+      static_cast<MCSymbolCOFF *>(getContext().getOrCreateSymbol(Label));
   // Define symbol as simple external function
   Sym->setExternal(true);
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 2e251cc..6782c4b 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -200,7 +200,7 @@ bool ELFAsmParser::parseDirectiveSize(StringRef, SMLoc) {
   StringRef Name;
   if (getParser().parseIdentifier(Name))
     return TokError("expected identifier");
-  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
+  auto *Sym = static_cast<MCSymbolELF *>(getContext().getOrCreateSymbol(Name));
   if (getLexer().isNot(AsmToken::Comma))
     return TokError("expected comma");
@@ -466,7 +466,7 @@ bool ELFAsmParser::parseLinkedToSym(MCSymbolELF *&LinkedToSym) {
     }
     return TokError("invalid linked-to symbol");
   }
-  LinkedToSym = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
+  LinkedToSym = static_cast<MCSymbolELF *>(getContext().lookupSymbol(Name));
   if (!LinkedToSym || !LinkedToSym->isInSection())
     return Error(StartLoc, "linked-to symbol is not in a section: " + Name);
   return false;
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index d97f4f5..6c2d241 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -224,7 +224,7 @@ public:
       return true;
     if (expect(AsmToken::EndOfStatement, "eol"))
       return true;
-    auto WasmSym = cast<MCSymbolWasm>(Sym);
+    auto WasmSym = static_cast<const MCSymbolWasm *>(Sym);
     if (WasmSym->isFunction()) {
       // Ignore .size directives for function symbols.  They get their size
      // set automatically based on their content.
@@ -241,9 +241,9 @@ public:
     if (!Lexer->is(AsmToken::Identifier))
       return error("Expected label after .type directive, got: ",
                    Lexer->getTok());
-    auto WasmSym = cast<MCSymbolWasm>(
-        getStreamer().getContext().getOrCreateSymbol(
-            Lexer->getTok().getString()));
+    auto *WasmSym = static_cast<MCSymbolWasm *>(
+        getStreamer().getContext().getOrCreateSymbol(
+            Lexer->getTok().getString()));
     Lex();
     if (!(isNext(AsmToken::Comma) && isNext(AsmToken::At) &&
           Lexer->is(AsmToken::Identifier)))
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 4f28267..27ca131 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -83,12 +83,14 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) {
 }
 void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) {
+  assert(Fixups.size() < 256 &&
+         "variable-size tail cannot have more than 256 fixups");
   auto &S = getParent()->FixupStorage;
-  if (VarFixupStart + Fixups.size() > VarFixupEnd) {
+  if (Fixups.size() > VarFixupSize) {
     VarFixupStart = S.size();
     S.resize_for_overwrite(S.size() + Fixups.size());
   }
-  VarFixupEnd = VarFixupStart + Fixups.size();
+  VarFixupSize = Fixups.size();
   // Source fixup offsets are relative to the variable part's start. Add the
   // fixed part size to make them relative to the fixed part's start.
   std::transform(Fixups.begin(), Fixups.end(), S.begin() + VarFixupStart,
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index e3ef111..9c8b224 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -36,7 +36,7 @@ using namespace llvm;
 MCWasmStreamer::~MCWasmStreamer() = default; // anchor.
 void MCWasmStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
-  auto *Symbol = cast<MCSymbolWasm>(S);
+  auto *Symbol = static_cast<MCSymbolWasm *>(S);
   MCObjectStreamer::emitLabel(Symbol, Loc);
   const MCSectionWasm &Section =
@@ -47,7 +47,7 @@ void MCWasmStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
 void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F,
                                     uint64_t Offset) {
-  auto *Symbol = cast<MCSymbolWasm>(S);
+  auto *Symbol = static_cast<MCSymbolWasm *>(S);
   MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset);
   const MCSectionWasm &Section =
@@ -69,8 +69,7 @@ void MCWasmStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 bool MCWasmStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
   assert(Attribute != MCSA_IndirectSymbol && "indirect symbols not supported");
-
-  auto *Symbol = cast<MCSymbolWasm>(S);
+  auto *Symbol = static_cast<MCSymbolWasm *>(S);
   // Adding a symbol attribute always introduces the symbol; note that an
   // important side effect of calling registerSymbol here is to register the
@@ -135,7 +134,7 @@ void MCWasmStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
 }
 void MCWasmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
-  cast<MCSymbolWasm>(Symbol)->setSize(Value);
+  static_cast<MCSymbolWasm *>(Symbol)->setSize(Value);
 }
 void MCWasmStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 72a8dd7..a87648a 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
   // Emit the epilog instructions.
   if (EnableUnwindV2) {
+    // Ensure the fixups and appended content apply to the same fragment.
+    OS->ensureHeadroom(info->EpilogMap.size() * 2);
+
     bool IsLast = true;
     for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
       if (IsLast) {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 1ffe25c..a45936b 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -163,13 +163,13 @@ void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 }
 void MCWinCOFFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
-  auto *Symbol = cast<MCSymbolCOFF>(S);
+  auto *Symbol = static_cast<MCSymbolCOFF *>(S);
   MCObjectStreamer::emitLabel(Symbol, Loc);
 }
 bool MCWinCOFFStreamer::emitSymbolAttribute(MCSymbol *S,
                                             MCSymbolAttr Attribute) {
-  auto *Symbol = cast<MCSymbolCOFF>(S);
+  auto *Symbol = static_cast<MCSymbolCOFF *>(S);
   getAssembler().registerSymbol(*Symbol);
   switch (Attribute) {
@@ -199,11 +199,10 @@ void MCWinCOFFStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
 }
 void MCWinCOFFStreamer::beginCOFFSymbolDef(MCSymbol const *S) {
-  auto *Symbol = cast<MCSymbolCOFF>(S);
   if (CurSymbol)
     Error("starting a new symbol definition without completing the "
           "previous one");
-  CurSymbol = Symbol;
+  CurSymbol = static_cast<MCSymbolCOFF *>(const_cast<MCSymbol *>(S));
 }
 void MCWinCOFFStreamer::emitCOFFSymbolStorageClass(int StorageClass) {
@@ -219,7 +218,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolStorageClass(int StorageClass) {
   }
   getAssembler().registerSymbol(*CurSymbol);
-  cast<MCSymbolCOFF>(CurSymbol)->setClass((uint16_t)StorageClass);
+  static_cast<MCSymbolCOFF *>(CurSymbol)->setClass((uint16_t)StorageClass);
 }
 void MCWinCOFFStreamer::emitCOFFSymbolType(int Type) {
@@ -234,7 +233,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolType(int Type) {
   }
   getAssembler().registerSymbol(*CurSymbol);
-  cast<MCSymbolCOFF>(CurSymbol)->setType((uint16_t)Type);
+  static_cast<const MCSymbolCOFF *>(CurSymbol)->setType((uint16_t)Type);
 }
 void MCWinCOFFStreamer::endCOFFSymbolDef() {
@@ -249,7 +248,7 @@ void MCWinCOFFStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {
   if (getContext().getTargetTriple().getArch() != Triple::x86)
     return;
-  const MCSymbolCOFF *CSymbol = cast<MCSymbolCOFF>(Symbol);
+  auto *CSymbol = static_cast<const MCSymbolCOFF *>(Symbol);
   if (CSymbol->isSafeSEH())
     return;
@@ -258,7 +257,7 @@ void MCWinCOFFStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {
   switchSection(SXData);
   SXData->ensureMinAlignment(Align(4));
-  insert(getContext().allocFragment<MCSymbolIdFragment>(Symbol));
+  newSpecialFragment<MCSymbolIdFragment>(Symbol);
   getAssembler().registerSymbol(*Symbol);
   CSymbol->setIsSafeSEH();
@@ -273,13 +272,14 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
   MCSection *Sec = getCurrentSectionOnly();
   Sec->ensureMinAlignment(Align(4));
-  insert(getContext().allocFragment<MCSymbolIdFragment>(Symbol));
+  newSpecialFragment<MCSymbolIdFragment>(Symbol);
   getAssembler().registerSymbol(*Symbol);
 }
 void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
   visitUsedSymbol(*Symbol);
   const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
+  ensureHeadroom(2);
   addFixup(SRE, FK_SecRel_2);
   appendContents(2, 0);
 }
@@ -293,6 +293,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
   if (Offset)
     MCE = MCBinaryExpr::createAdd(
         MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+  ensureHeadroom(4);
   addFixup(MCE, FK_SecRel_4);
   // Emit 4 bytes (zeros) to the object file.
   appendContents(4, 0);
@@ -308,6 +309,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
   if (Offset)
     MCE = MCBinaryExpr::createAdd(
         MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+  ensureHeadroom(4);
   addFixup(MCE, FK_Data_4);
   // Emit 4 bytes (zeros) to the object file.
   appendContents(4, 0);
@@ -318,6 +320,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
   // Create Symbol for section number.
   const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
       *Symbol, this->getWriter(), getContext());
+  ensureHeadroom(4);
   addFixup(MCE, FK_Data_4);
   // Emit 4 bytes (zeros) to the object file.
   appendContents(4, 0);
@@ -328,6 +331,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
   // Create Symbol for section offset.
   const MCExpr *MCE =
       MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+  ensureHeadroom(4);
   addFixup(MCE, FK_Data_4);
   // Emit 4 bytes (zeros) to the object file.
   appendContents(4, 0);
@@ -335,7 +339,7 @@ void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
                                          Align ByteAlignment) {
-  auto *Symbol = cast<MCSymbolCOFF>(S);
+  auto *Symbol = static_cast<MCSymbolCOFF *>(S);
   const Triple &T = getContext().getTargetTriple();
   if (T.isWindowsMSVCEnvironment()) {
@@ -367,7 +371,7 @@ void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
 void MCWinCOFFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
                                               Align ByteAlignment) {
-  auto *Symbol = cast<MCSymbolCOFF>(S);
+  auto *Symbol = static_cast<MCSymbolCOFF *>(S);
   MCSection *Section = getContext().getObjectFileInfo()->getBSSSection();
   pushSection();
@@ -382,7 +386,7 @@ void MCWinCOFFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
 // Hack: Used by llvm-ml to implement the alias directive.
 void MCWinCOFFStreamer::emitWeakReference(MCSymbol *AliasS,
                                           const MCSymbol *Symbol) {
-  auto *Alias = cast<MCSymbolCOFF>(AliasS);
+  auto *Alias = static_cast<MCSymbolCOFF *>(AliasS);
   emitSymbolAttribute(Alias, MCSA_Weak);
   Alias->setIsWeakExternal(true);
@@ -410,7 +414,7 @@ void MCWinCOFFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
 void MCWinCOFFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
   const MCSymbol *S = &SRE->getSymbol();
   if (getAssembler().registerSymbol(*S))
-    cast<MCSymbolCOFF>(S)->setExternal(true);
+    static_cast<const MCSymbolCOFF *>(S)->setExternal(true);
 }
 void MCWinCOFFStreamer::finishImpl() {
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 26f45ce..a0e3dba 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -52,7 +52,7 @@ void MCXCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
 bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym,
                                           MCSymbolAttr Attribute) {
-  auto *Symbol = cast<MCSymbolXCOFF>(Sym);
+  auto *Symbol = static_cast<MCSymbolXCOFF *>(Sym);
   getAssembler().registerSymbol(*Symbol);
   switch (Attribute) {
@@ -109,7 +109,7 @@ void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
 void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
                                                StringRef Rename) {
-  const MCSymbolXCOFF *Symbol = cast<const MCSymbolXCOFF>(Name);
+  auto *Symbol = static_cast<const MCSymbolXCOFF *>(Name);
   if (!Symbol->hasRename())
     report_fatal_error("Only explicit .rename is supported for XCOFF.");
 }
@@ -129,15 +129,14 @@ void MCXCOFFStreamer::emitXCOFFCInfoSym(StringRef Name, StringRef Metadata) {
 void MCXCOFFStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                        Align ByteAlignment) {
+  auto *Sym = static_cast<MCSymbolXCOFF *>(Symbol);
   getAssembler().registerSymbol(*Symbol);
-  Symbol->setExternal(cast<MCSymbolXCOFF>(Symbol)->getStorageClass() !=
-                      XCOFF::C_HIDEXT);
+  Symbol->setExternal(Sym->getStorageClass() != XCOFF::C_HIDEXT);
   Symbol->setCommon(Size, ByteAlignment);
   // Default csect align is 4, but common symbols have explicit alignment values
   // and we should honor it.
-  cast<MCSymbolXCOFF>(Symbol)->getRepresentedCsect()->setAlignment(
-      ByteAlignment);
+  Sym->getRepresentedCsect()->setAlignment(ByteAlignment);
   // Emit the alignment and storage for the variable to the section.
   emitValueToAlignment(ByteAlignment);
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index e87696a..eb59e39 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -72,7 +72,7 @@ bool MachObjectWriter::doesSymbolRequireExternRelocation(const MCSymbol &S) {
   // References to weak definitions require external relocation entries; the
   // definition may not always be the one in the same object file.
-  if (cast<MCSymbolMachO>(S).isWeakDefinition())
+  if (static_cast<const MCSymbolMachO &>(S).isWeakDefinition())
     return true;
   // Otherwise, we can use an internal relocation.
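[Editor's note: for orientation, the weak-definition test above feeds the choice between an external Mach-O relocation (recorded against the symbol) and an internal one (recorded against its section). The decision, restated with simplified stand-in types:]

struct MachOSym {
  bool Undefined = false;
  bool WeakDefinition = false;
};

static bool needsExternReloc(const MachOSym &S) {
  if (S.Undefined)
    return true; // target lives outside this object file
  if (S.WeakDefinition)
    return true; // the prevailing definition may come from another file
  return false;  // safe to relocate against the containing section
}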
@@ -383,15 +383,16 @@ const MCSymbol &MachObjectWriter::findAliasedSymbol(const MCSymbol &Sym) const {
 }
 void MachObjectWriter::writeNlist(MachSymbolData &MSD, const MCAssembler &Asm) {
-  const MCSymbol *Symbol = MSD.Symbol;
-  const auto &Data = cast<MCSymbolMachO>(*Symbol);
-  const MCSymbol *AliasedSymbol = &findAliasedSymbol(*Symbol);
+  auto *Symbol = static_cast<const MCSymbolMachO *>(MSD.Symbol);
+  const auto &Data = static_cast<const MCSymbolMachO &>(*Symbol);
+  auto *AliasedSymbol =
+      static_cast<const MCSymbolMachO *>(&findAliasedSymbol(*Symbol));
   uint8_t SectionIndex = MSD.SectionIndex;
   uint8_t Type = 0;
   uint64_t Address = 0;
   bool IsAlias = Symbol != AliasedSymbol;
-  const MCSymbol &OrigSymbol = *Symbol;
+  const MCSymbolMachO &OrigSymbol = *Symbol;
   MachSymbolData *AliaseeInfo;
   if (IsAlias) {
     AliaseeInfo = findSymbolData(*AliasedSymbol);
@@ -441,9 +442,8 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD, const MCAssembler &Asm) {
   // The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
   // value.
-  bool EncodeAsAltEntry =
-      IsAlias && cast<MCSymbolMachO>(OrigSymbol).isAltEntry();
-  W.write<uint16_t>(cast<MCSymbolMachO>(Symbol)->getEncodedFlags(EncodeAsAltEntry));
+  bool EncodeAsAltEntry = IsAlias && OrigSymbol.isAltEntry();
+  W.write<uint16_t>(Symbol->getEncodedFlags(EncodeAsAltEntry));
   if (is64Bit())
     W.write<uint64_t>(Address);
   else
@@ -570,7 +570,8 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
     //
     // FIXME: Do not hardcode.
     if (Asm.registerSymbol(*ISD.Symbol))
-      cast<MCSymbolMachO>(ISD.Symbol)->setReferenceTypeUndefinedLazy(true);
+      static_cast<MCSymbolMachO *>(ISD.Symbol)
+          ->setReferenceTypeUndefinedLazy(true);
   }
 }
@@ -588,7 +589,7 @@ void MachObjectWriter::computeSymbolTable(
   // Build the string table.
   for (const MCSymbol &Symbol : Asm.symbols()) {
-    if (!cast<MCSymbolMachO>(Symbol).isSymbolLinkerVisible())
+    if (!static_cast<const MCSymbolMachO &>(Symbol).isSymbolLinkerVisible())
       continue;
     StringTable.add(Symbol.getName());
@@ -602,7 +603,7 @@ void MachObjectWriter::computeSymbolTable(
   // important for letting us diff .o files.
   for (const MCSymbol &Symbol : Asm.symbols()) {
     // Ignore non-linker visible symbols.
-    if (!cast<MCSymbolMachO>(Symbol).isSymbolLinkerVisible())
+    if (!static_cast<const MCSymbolMachO &>(Symbol).isSymbolLinkerVisible())
       continue;
     if (!Symbol.isExternal() && !Symbol.isUndefined())
@@ -628,7 +629,7 @@ void MachObjectWriter::computeSymbolTable(
   // Now add the data for local symbols.
   for (const MCSymbol &Symbol : Asm.symbols()) {
     // Ignore non-linker visible symbols.
-    if (!cast<MCSymbolMachO>(Symbol).isSymbolLinkerVisible())
+    if (!static_cast<const MCSymbolMachO &>(Symbol).isSymbolLinkerVisible())
       continue;
     if (Symbol.isExternal() || Symbol.isUndefined())
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index bfd6334..af009a4 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -487,7 +487,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
   bool IsLocRel = false;
   if (const auto *RefB = Target.getSubSym()) {
-    const auto &SymB = cast<MCSymbolWasm>(*RefB);
+    auto &SymB = static_cast<const MCSymbolWasm &>(*RefB);
     if (FixupSection.isText()) {
       Ctx.reportError(Fixup.getLoc(),
@@ -515,7 +515,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
   }
   // We either rejected the fixup or folded B into C at this point.
-  const auto *SymA = cast<MCSymbolWasm>(Target.getAddSym());
+  auto *SymA = static_cast<const MCSymbolWasm *>(Target.getAddSym());
   // The .init_array isn't translated as data, so don't do relocations in it.
   if (FixupSection.getName().starts_with(".init_array")) {
@@ -561,7 +561,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
       report_fatal_error("section symbol is required for relocation");
     C += Asm->getSymbolOffset(*SymA);
-    SymA = cast<MCSymbolWasm>(SectionSymbol);
+    SymA = static_cast<const MCSymbolWasm *>(SectionSymbol);
   }
   if (Type == wasm::R_WASM_TABLE_INDEX_REL_SLEB ||
@@ -573,7 +573,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
     // TABLE_INDEX relocs implicitly use the default indirect function table.
     // We require the function table to have already been defined.
     auto TableName = "__indirect_function_table";
-    MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(TableName));
+    auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(TableName));
     if (!Sym) {
       report_fatal_error("missing indirect function table symbol");
     } else {
@@ -631,8 +631,8 @@ WasmObjectWriter::getProvisionalValue(const MCAssembler &Asm,
   case wasm::R_WASM_TABLE_INDEX_I32:
   case wasm::R_WASM_TABLE_INDEX_I64: {
     // Provisional value is table address of the resolved symbol itself
-    const MCSymbolWasm *Base =
-        cast<MCSymbolWasm>(Asm.getBaseSymbol(*RelEntry.Symbol));
+    auto *Base =
+        static_cast<const MCSymbolWasm *>(Asm.getBaseSymbol(*RelEntry.Symbol));
     assert(Base->isFunction());
     if (RelEntry.Type == wasm::R_WASM_TABLE_INDEX_REL_SLEB ||
         RelEntry.Type == wasm::R_WASM_TABLE_INDEX_REL_SLEB64)
@@ -1342,11 +1342,11 @@ void WasmObjectWriter::prepareImports(
     // Register types for all functions, including those with private linkage
     // (because wasm always needs a type signature).
if (WS.isFunction()) { - const auto *BS = Asm.getBaseSymbol(S); + auto *BS = static_cast<const MCSymbolWasm *>(Asm.getBaseSymbol(S)); if (!BS) report_fatal_error(Twine(S.getName()) + ": absolute addressing not supported!"); - registerFunctionType(*cast<MCSymbolWasm>(BS)); + registerFunctionType(*BS); } if (WS.isTag()) @@ -1516,10 +1516,10 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, // For user-defined custom sections, strip the prefix Name.consume_front(".custom_section."); - MCSymbol *Begin = Sec.getBeginSymbol(); + auto *Begin = static_cast<MCSymbolWasm *>(Sec.getBeginSymbol()); if (Begin) { - assert(WasmIndices.count(cast<MCSymbolWasm>(Begin)) == 0); - WasmIndices[cast<MCSymbolWasm>(Begin)] = CustomSections.size(); + assert(WasmIndices.count(Begin) == 0); + WasmIndices[Begin] = CustomSections.size(); } // Separate out the producers and target features sections @@ -1719,7 +1719,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, if (!BS) report_fatal_error(Twine(S.getName()) + ": absolute addressing not supported!"); - const MCSymbolWasm *Base = cast<MCSymbolWasm>(BS); + const MCSymbolWasm *Base = static_cast<const MCSymbolWasm *>(BS); // Find the target symbol of this weak alias and export that index const auto &WS = static_cast<const MCSymbolWasm &>(S); @@ -1829,8 +1829,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, Rel.Type != wasm::R_WASM_TABLE_INDEX_REL_SLEB64) return; assert(Rel.Symbol->isFunction()); - const MCSymbolWasm *Base = - cast<MCSymbolWasm>(Asm.getBaseSymbol(*Rel.Symbol)); + auto *Base = + static_cast<const MCSymbolWasm *>(Asm.getBaseSymbol(*Rel.Symbol)); uint32_t FunctionIndex = WasmIndices.find(Base)->second; uint32_t TableIndex = TableElems.size() + InitialTableOffset; if (TableIndices.try_emplace(Base, TableIndex).second) { @@ -1880,7 +1880,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, if (!SymRef) report_fatal_error( "fixups in .init_array should be symbol references"); - const auto &TargetSym = cast<const MCSymbolWasm>(SymRef->getSymbol()); + auto &TargetSym = + static_cast<const MCSymbolWasm &>(SymRef->getSymbol()); if (TargetSym.getIndex() == InvalidIndex) report_fatal_error("symbols in .init_array should exist in symtab"); if (!TargetSym.isFunction()) @@ -1905,7 +1906,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, writeExportSection(Exports); const MCSymbol *IndirectFunctionTable = getContext().lookupSymbol("__indirect_function_table"); - writeElemSection(cast_or_null<const MCSymbolWasm>(IndirectFunctionTable), + writeElemSection(static_cast<const MCSymbolWasm *>(IndirectFunctionTable), TableElems); writeDataCountSection(); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 856850d..0cc5ff5 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -382,7 +382,8 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) { COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym); COFFSymbol *Local = nullptr; - if (cast<MCSymbolCOFF>(MCSym).getWeakExternalCharacteristics()) { + if (static_cast<const MCSymbolCOFF &>(MCSym) + .getWeakExternalCharacteristics()) { Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; Sym->Section = nullptr; @@ -406,7 +407,8 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) { Sym->Aux[0].AuxType = ATWeakExternal; Sym->Aux[0].Aux.WeakExternal.TagIndex = 0; // Filled in later Sym->Aux[0].Aux.WeakExternal.Characteristics = - 
cast<MCSymbolCOFF>(MCSym).getWeakExternalCharacteristics(); + static_cast<const MCSymbolCOFF &>(MCSym) + .getWeakExternalCharacteristics(); } else { if (!Base) Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE; @@ -418,7 +420,7 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) { if (Local) { Local->Data.Value = getSymbolValue(MCSym, *Asm); - const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym); + auto &SymbolCOFF = static_cast<const MCSymbolCOFF &>(MCSym); Local->Data.Type = SymbolCOFF.getType(); Local->Data.StorageClass = SymbolCOFF.getClass(); @@ -821,7 +823,8 @@ void WinCOFFWriter::executePostLayoutBinding() { for (const MCSymbol &Symbol : Asm->symbols()) // Define non-temporary or temporary static (private-linkage) symbols if (!Symbol.isTemporary() || - cast<MCSymbolCOFF>(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC) + static_cast<const MCSymbolCOFF &>(Symbol).getClass() == + COFF::IMAGE_SYM_CLASS_STATIC) defineSymbol(Symbol); UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; @@ -1188,7 +1191,7 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( // point to thunks, and the /GUARD:CF flag assumes that it can use relocations // to approximate the set of all address-taken functions. LLD's implementation // of /GUARD:CF also relies on the existence of these relocations. - uint16_t Type = cast<MCSymbolCOFF>(SymA).getType(); + uint16_t Type = static_cast<const MCSymbolCOFF &>(SymA).getType(); if ((Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION) return false; return &SymA.getSection() == FB.getParent(); diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 65f543b..13917ba 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -591,7 +591,7 @@ void XCOFFWriter::executePostLayoutBinding() { if (S.isTemporary()) continue; - const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(&S); + auto *XSym = static_cast<const MCSymbolXCOFF *>(&S); const MCSectionXCOFF *ContainingCsect = getContainingCsect(XSym); if (ContainingCsect->isDwarfSect()) @@ -690,7 +690,8 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, std::tie(Type, SignAndSize) = TargetObjectWriter->getRelocTypeAndSignSize( Target, Fixup, Fixup.isPCRel()); - const MCSectionXCOFF *SymASec = getContainingCsect(cast<MCSymbolXCOFF>(SymA)); + const MCSectionXCOFF *SymASec = + getContainingCsect(static_cast<const MCSymbolXCOFF *>(SymA)); assert(SectionMap.contains(SymASec) && "Expected containing csect to exist in map."); @@ -773,13 +774,13 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, "Expected containing csect to exist in map."); SectionMap[RelocationSec]->Relocations.push_back(Reloc); - const MCSymbol *const SymB = Target.getSubSym(); + auto SymB = static_cast<const MCSymbolXCOFF *>(Target.getSubSym()); if (!SymB) return; if (SymA == SymB) report_fatal_error("relocation for opposite term is not yet supported"); - const MCSectionXCOFF *SymBSec = getContainingCsect(cast<MCSymbolXCOFF>(SymB)); + const MCSectionXCOFF *SymBSec = getContainingCsect(SymB); assert(SectionMap.contains(SymBSec) && "Expected containing csect to exist in map."); if (SymASec == SymBSec) diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp index 2dfae8a..da2a7bb 100644 --- a/llvm/lib/Object/Binary.cpp +++ b/llvm/lib/Object/Binary.cpp @@ -75,6 +75,7 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer, case file_magic::xcoff_object_32:
case file_magic::xcoff_object_64: case file_magic::wasm_object: + case file_magic::dxcontainer_object: return ObjectFile::createSymbolicFile(Buffer, Type, Context, InitContent); case file_magic::macho_universal_binary: return MachOUniversalBinary::create(Buffer); @@ -87,7 +88,6 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer, case file_magic::clang_ast: case file_magic::cuda_fatbinary: case file_magic::coff_cl_gl_object: - case file_magic::dxcontainer_object: case file_magic::offload_bundle: case file_magic::offload_bundle_compressed: case file_magic::spirv_object: diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 0b46ff7..031b941 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -11,6 +11,7 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Endian.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/TargetParser/SubtargetFeature.h" using namespace llvm; using namespace llvm::object; @@ -515,3 +516,183 @@ uint8_t DirectX::PSVRuntimeInfo::getSigPatchOrPrimCount() const { return P->SigPatchOrPrimElements; return 0; } + +class DXNotSupportedError : public ErrorInfo<DXNotSupportedError> { +public: + static char ID; + + DXNotSupportedError(StringRef S) : FeatureString(S) {} + + void log(raw_ostream &OS) const override { + OS << "DXContainer does not support " << FeatureString; + } + + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + +private: + StringRef FeatureString; +}; + +char DXNotSupportedError::ID = 0; + +Expected<section_iterator> +DXContainerObjectFile::getSymbolSection(DataRefImpl Symb) const { + return make_error<DXNotSupportedError>("Symbol sections"); +} + +Expected<StringRef> DXContainerObjectFile::getSymbolName(DataRefImpl) const { + return make_error<DXNotSupportedError>("Symbol names"); +} + +Expected<uint64_t> +DXContainerObjectFile::getSymbolAddress(DataRefImpl Symb) const { + return make_error<DXNotSupportedError>("Symbol addresses"); +} + +uint64_t DXContainerObjectFile::getSymbolValueImpl(DataRefImpl Symb) const { + llvm_unreachable("DXContainer does not support symbols"); +} +uint64_t +DXContainerObjectFile::getCommonSymbolSizeImpl(DataRefImpl Symb) const { + llvm_unreachable("DXContainer does not support symbols"); +} + +Expected<SymbolRef::Type> +DXContainerObjectFile::getSymbolType(DataRefImpl Symb) const { + return make_error<DXNotSupportedError>("Symbol types"); +} + +void DXContainerObjectFile::moveSectionNext(DataRefImpl &Sec) const { + PartIterator It = reinterpret_cast<PartIterator>(Sec.p); + if (It == Parts.end()) + return; + + ++It; + Sec.p = reinterpret_cast<uintptr_t>(It); +} + +Expected<StringRef> +DXContainerObjectFile::getSectionName(DataRefImpl Sec) const { + PartIterator It = reinterpret_cast<PartIterator>(Sec.p); + return StringRef(It->Part.getName()); +} + +uint64_t DXContainerObjectFile::getSectionAddress(DataRefImpl Sec) const { + PartIterator It = reinterpret_cast<PartIterator>(Sec.p); + return It->Offset; +} + +uint64_t DXContainerObjectFile::getSectionIndex(DataRefImpl Sec) const { + return (Sec.p - reinterpret_cast<uintptr_t>(Parts.begin())) / + sizeof(PartIterator); +} + +uint64_t DXContainerObjectFile::getSectionSize(DataRefImpl Sec) const { + PartIterator It = reinterpret_cast<PartIterator>(Sec.p); + return It->Data.size(); +} +Expected<ArrayRef<uint8_t>> +DXContainerObjectFile::getSectionContents(DataRefImpl Sec) const { + PartIterator It = reinterpret_cast<PartIterator>(Sec.p); + 
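Throughout the new DXContainerObjectFile, the opaque DataRefImpl handle is nothing more than a PartIterator packed into a uintptr_t, which the reinterpret_cast just above unpacks. A self-contained sketch of that round-trip, with plain pointers standing in for PartIterator:

#include <cstdint>
#include <vector>

struct Part { int Data; };

int main() {
  std::vector<Part> Parts = {{1}, {2}, {3}};
  // Pack: a "section handle" is just the element pointer stored as an integer.
  uintptr_t Handle = reinterpret_cast<uintptr_t>(Parts.data());
  // Unpack, advance, repack -- the moveSectionNext pattern.
  const Part *It = reinterpret_cast<const Part *>(Handle);
  ++It;
  Handle = reinterpret_cast<uintptr_t>(It);
  return reinterpret_cast<const Part *>(Handle)->Data == 2 ? 0 : 1;
}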
return ArrayRef<uint8_t>(It->Data.bytes_begin(), It->Data.size()); +} + +uint64_t DXContainerObjectFile::getSectionAlignment(DataRefImpl Sec) const { + return 1; +} + +bool DXContainerObjectFile::isSectionCompressed(DataRefImpl Sec) const { + return false; +} + +bool DXContainerObjectFile::isSectionText(DataRefImpl Sec) const { + return false; +} + +bool DXContainerObjectFile::isSectionData(DataRefImpl Sec) const { + return false; +} + +bool DXContainerObjectFile::isSectionBSS(DataRefImpl Sec) const { + return false; +} + +bool DXContainerObjectFile::isSectionVirtual(DataRefImpl Sec) const { + return false; +} + +relocation_iterator +DXContainerObjectFile::section_rel_begin(DataRefImpl Sec) const { + return relocation_iterator(RelocationRef()); +} + +relocation_iterator +DXContainerObjectFile::section_rel_end(DataRefImpl Sec) const { + return relocation_iterator(RelocationRef()); +} + +void DXContainerObjectFile::moveRelocationNext(DataRefImpl &Rel) const { + llvm_unreachable("DXContainer does not support relocations"); +} + +uint64_t DXContainerObjectFile::getRelocationOffset(DataRefImpl Rel) const { + llvm_unreachable("DXContainer does not support relocations"); +} + +symbol_iterator +DXContainerObjectFile::getRelocationSymbol(DataRefImpl Rel) const { + return symbol_iterator(SymbolRef()); +} + +uint64_t DXContainerObjectFile::getRelocationType(DataRefImpl Rel) const { + llvm_unreachable("DXContainer does not support relocations"); +} + +void DXContainerObjectFile::getRelocationTypeName( + DataRefImpl Rel, SmallVectorImpl<char> &Result) const { + llvm_unreachable("DXContainer does not support relocations"); +} + +section_iterator DXContainerObjectFile::section_begin() const { + DataRefImpl Sec; + Sec.p = reinterpret_cast<uintptr_t>(Parts.begin()); + return section_iterator(SectionRef(Sec, this)); +} +section_iterator DXContainerObjectFile::section_end() const { + DataRefImpl Sec; + Sec.p = reinterpret_cast<uintptr_t>(Parts.end()); + return section_iterator(SectionRef(Sec, this)); +} + +uint8_t DXContainerObjectFile::getBytesInAddress() const { return 4; } + +StringRef DXContainerObjectFile::getFileFormatName() const { + return "DirectX Container"; +} + +Triple::ArchType DXContainerObjectFile::getArch() const { return Triple::dxil; } + +Expected<SubtargetFeatures> DXContainerObjectFile::getFeatures() const { + return SubtargetFeatures(); +} + +Error DXContainerObjectFile::printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const { + return make_error<DXNotSupportedError>("Symbol names"); +} + +Expected<uint32_t> +DXContainerObjectFile::getSymbolFlags(DataRefImpl Symb) const { + return make_error<DXNotSupportedError>("Symbol flags"); +} + +Expected<std::unique_ptr<DXContainerObjectFile>> +ObjectFile::createDXContainerObjectFile(MemoryBufferRef Object) { + auto ExC = DXContainer::create(Object); + if (!ExC) + return ExC.takeError(); + std::unique_ptr<DXContainerObjectFile> Obj(new DXContainerObjectFile(*ExC)); + return std::move(Obj); +} diff --git a/llvm/lib/Object/Object.cpp b/llvm/lib/Object/Object.cpp index c62944a..112927e 100644 --- a/llvm/lib/Object/Object.cpp +++ b/llvm/lib/Object/Object.cpp @@ -124,6 +124,8 @@ LLVMBinaryType LLVMBinaryGetType(LLVMBinaryRef BR) { return LLVMBinaryTypeOffload; case ID_Wasm: return LLVMBinaryTypeWasm; + case ID_DXContainer: + return LLVMBinaryTypeDXcontainer; case ID_StartObjects: case ID_EndObjects: llvm_unreachable("Marker types are not valid binary kinds!"); diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp index 
6a226a3..b0e4ea0 100644 --- a/llvm/lib/Object/ObjectFile.cpp +++ b/llvm/lib/Object/ObjectFile.cpp @@ -15,6 +15,7 @@ #include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" +#include "llvm/Object/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Object/MachO.h" #include "llvm/Object/Wasm.h" @@ -165,7 +166,6 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type, case file_magic::goff_object: case file_magic::cuda_fatbinary: case file_magic::offload_binary: - case file_magic::dxcontainer_object: case file_magic::offload_bundle: case file_magic::offload_bundle_compressed: case file_magic::spirv_object: @@ -201,6 +201,8 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type, return createXCOFFObjectFile(Object, Binary::ID_XCOFF64); case file_magic::wasm_object: return createWasmObjectFile(Object); + case file_magic::dxcontainer_object: + return createDXContainerObjectFile(Object); } llvm_unreachable("Unexpected Object File Type"); } diff --git a/llvm/lib/Object/SymbolicFile.cpp b/llvm/lib/Object/SymbolicFile.cpp index e87ecb1..47295e6 100644 --- a/llvm/lib/Object/SymbolicFile.cpp +++ b/llvm/lib/Object/SymbolicFile.cpp @@ -68,6 +68,7 @@ SymbolicFile::createSymbolicFile(MemoryBufferRef Object, file_magic Type, case file_magic::xcoff_object_32: case file_magic::xcoff_object_64: case file_magic::wasm_object: + case file_magic::dxcontainer_object: return ObjectFile::createObjectFile(Object, Type, InitContent); case file_magic::coff_import_library: return std::unique_ptr<SymbolicFile>(new COFFImportFile(Object)); @@ -123,6 +124,7 @@ bool SymbolicFile::isSymbolicFile(file_magic Type, const LLVMContext *Context) { case file_magic::elf_relocatable: case file_magic::macho_object: case file_magic::coff_object: + case file_magic::dxcontainer_object: return true; default: return false; diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index bd3964c..5425729 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1160,8 +1160,7 @@ void getValueForSiteInstrProf(const void *R, InstrProfValueData *Dst, } ValueProfData *allocValueProfDataInstrProf(size_t TotalSizeInBytes) { - ValueProfData *VD = - (ValueProfData *)(new (::operator new(TotalSizeInBytes)) ValueProfData()); + ValueProfData *VD = new (::operator new(TotalSizeInBytes)) ValueProfData(); memset(VD, 0, TotalSizeInBytes); return VD; } diff --git a/llvm/lib/SandboxIR/Value.cpp b/llvm/lib/SandboxIR/Value.cpp index e39bbc4..94b4a4c 100644 --- a/llvm/lib/SandboxIR/Value.cpp +++ b/llvm/lib/SandboxIR/Value.cpp @@ -22,7 +22,7 @@ Value::Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx) Value::use_iterator Value::use_begin() { llvm::Use *LLVMUse = nullptr; - if (Val->use_begin() != Val->use_end()) + if (!Val->uses().empty()) LLVMUse = &*Val->use_begin(); User *User = LLVMUse != nullptr ? 
cast_or_null<sandboxir::User>(Ctx.getValue( Val->use_begin()->getUser())) diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index ed3b149..1914f4c 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -306,7 +306,7 @@ void BalancedPartitioning::split(const FunctionNodeRange Nodes, unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2; - llvm::sort(Nodes.begin(), Nodes.end(), [](auto &L, auto &R) { + llvm::sort(Nodes, [](auto &L, auto &R) { return L.InputOrderIndex < R.InputOrderIndex; }); diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index e8d3161..082de56 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -597,6 +597,14 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { return Thunk; } +std::optional<std::string> getArm64ECMangledFunctionName(GlobalValue &GV) { + if (!GV.hasName()) { + GV.setName("__unnamed"); + } + + return llvm::getArm64ECMangledFunctionName(GV.getName()); +} + // Builds the "guest exit thunk", a helper to call a function which may or may // not be an exit thunk. (We optimistically assume non-dllimport function // declarations refer to functions defined in AArch64 code; if the linker @@ -608,7 +616,7 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { getThunkType(F->getFunctionType(), F->getAttributes(), Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty, ArgTranslations); - auto MangledName = getArm64ECMangledFunctionName(F->getName().str()); + auto MangledName = getArm64ECMangledFunctionName(*F); assert(MangledName && "Can't guest exit to function that's already native"); std::string ThunkName = *MangledName; if (ThunkName[0] == '?' && ThunkName.find("@") != std::string::npos) { @@ -790,7 +798,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { if (!F) continue; if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(A.getName().str())) { + getArm64ECMangledFunctionName(A)) { F->addMetadata("arm64ec_unmangled_name", *MDNode::get(M->getContext(), MDString::get(M->getContext(), A.getName()))); @@ -807,7 +815,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts()); if (PersFn->getValueType() && PersFn->getValueType()->isFunctionTy()) { if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(PersFn->getName().str())) { + getArm64ECMangledFunctionName(*PersFn)) { PersFn->setName(MangledName.value()); } } @@ -821,7 +829,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { // Rename hybrid patchable functions and change callers to use a global // alias instead. if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(F.getName().str())) { + getArm64ECMangledFunctionName(F)) { std::string OrigName(F.getName()); F.setName(MangledName.value() + HybridPatchableTargetSuffix); @@ -927,7 +935,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // FIXME: Handle functions with weak linkage? 
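The new getArm64ECMangledFunctionName overload above takes the GlobalValue itself so it can assign a placeholder name to nameless globals before mangling, which the StringRef-based helper cannot do. A minimal sketch of that shape, using toy stand-ins rather than the real LLVM API:

#include <iostream>
#include <optional>
#include <string>

struct Global {
  std::string Name;
  bool hasName() const { return !Name.empty(); }
};

// Stand-in for the real Arm64EC mangling rules.
std::optional<std::string> mangleName(const std::string &N) {
  return "#" + N;
}

std::optional<std::string> mangleGlobal(Global &GV) {
  if (!GV.hasName())
    GV.Name = "__unnamed"; // mirrors the patch: name it first, then mangle
  return mangleName(GV.Name);
}

int main() {
  Global G;                              // starts nameless
  std::cout << *mangleGlobal(G) << "\n"; // prints "#__unnamed"
}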
if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(F.getName().str())) { + getArm64ECMangledFunctionName(F)) { F.addMetadata("arm64ec_unmangled_name", *MDNode::get(M->getContext(), MDString::get(M->getContext(), F.getName()))); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea63edd8..8887657 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -887,6 +887,10 @@ private: bool shouldScalarizeBinop(SDValue VecOp) const override { return VecOp.getOpcode() == ISD::SETCC; } + + bool hasMultipleConditionRegisters(EVT VT) const override { + return VT.isScalableVector(); + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index ba7cbcc..5a537f2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6484,7 +6484,9 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, bits<2> sz, bits<4> opc, string a (OpNode (AccumType RegType:$Rd), (InputType RegType:$Rn), (InputType RegType:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); + + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> { @@ -6507,7 +6509,8 @@ class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm (OpNode (AccumType RegType:$Rd), (InputType RegType:$Rn), (InputType RegType:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); let Inst{13} = b13; } @@ -8986,7 +8989,8 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1, (InputType RegType:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # - ", $Rm" # kind2 # "}"); + ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorBFDot<bit U, string asm> { @@ -9032,7 +9036,7 @@ class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}"); } let mayRaiseFPException = 1, Uses = [FPCR] in @@ -9071,8 +9075,7 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", - ", $Rm", ".8h", "}"); + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}"); } let mayRaiseFPException = 1, Uses = [FPCR] in @@ -9143,7 +9146,7 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]> { - let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}"; } //---------------------------------------------------------------------------- @@ -13344,8 +13347,8 @@ multiclass AtomicFPStore<bit R, bits<3> 
op0, string asm> { class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind> : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101, V128, asm, ".16b", []> { - let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn", ".16b", - ", $Rm", ".16b", "}"); + let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b", + "|", kind, "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e1adc0b..9f05add 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3092,6 +3092,13 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as + // we use fcvtx under SVE2. Give them invalid costs. + if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() && + ISD == ISD::FP_ROUND && SrcTy.isScalableVector() && + DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64) + return InstructionCost::getInvalid(); + static const TypeConversionCostTblEntry BF16Tbl[] = { {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt @@ -3100,6 +3107,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1 + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1 + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp }; if (ST->hasBF16()) @@ -3508,11 +3521,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1}, {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3}, + // Truncate from nxvmf32 to nxvmbf16. + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8}, + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8}, + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17}, + // Truncate from nxvmf64 to nxvmf16. {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1}, {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3}, {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7}, + // Truncate from nxvmf64 to nxvmbf16. + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9}, + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19}, + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39}, + // Truncate from nxvmf64 to nxvmf32. {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1}, {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3}, @@ -3523,11 +3546,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, + // Extend from nxvmbf16 to nxvmf32. + {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl + {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl + {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl + // Extend from nxvmf16 to nxvmf64. 
{ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, + // Extend from nxvmbf16 to nxvmf64. + {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt + {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt + {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt + // Extend from nxvmf32 to nxvmf64. {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, @@ -4282,10 +4315,9 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register // width. TODO: Improve this with different cost kinds. - if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { + if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) { // We would need this many instructions to hide the scalarization happening. const int AmortizationCost = 20; @@ -4315,55 +4347,72 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( return LT.first; } - static const TypeConversionCostTblEntry - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 }, - { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 }, - { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 }, - { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 }, - { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { + {Instruction::Select, MVT::v2i1, MVT::v2f32, 2}, + {Instruction::Select, MVT::v2i1, MVT::v2f64, 2}, + {Instruction::Select, MVT::v4i1, MVT::v4f32, 2}, + {Instruction::Select, MVT::v4i1, MVT::v4f16, 2}, + {Instruction::Select, MVT::v8i1, MVT::v8f16, 2}, + {Instruction::Select, MVT::v16i1, MVT::v16i16, 16}, + {Instruction::Select, MVT::v8i1, MVT::v8i32, 8}, + {Instruction::Select, MVT::v16i1, MVT::v16i32, 16}, + {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost}, + {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost}, + {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}}; EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode, SelCondTy.getSimpleVT(), SelValTy.getSimpleVT())) return Entry->Cost; } } - if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) { - Type *ValScalarTy = ValTy->getScalarType(); - if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) || - ValScalarTy->isBFloatTy()) { - auto *ValVTy = cast<FixedVectorType>(ValTy); - - // Without dedicated instructions we promote [b]f16 compares to f32. - auto *PromotedTy = - VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy); - - InstructionCost Cost = 0; - // Promote operands to float vectors. 
- Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - // Compare float vectors. + if (Opcode == Instruction::FCmp) { + // Without dedicated instructions we promote f16 + bf16 compares to f32. + if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || + ValTy->getScalarType()->isBFloatTy()) { + Type *PromotedTy = + ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); + InstructionCost Cost = + getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, Op1Info, Op2Info); - // During codegen we'll truncate the vector result from i32 to i16. - Cost += - getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy), - VectorType::getInteger(PromotedTy), - TTI::CastContextHint::None, CostKind); + if (ValTy->isVectorTy()) + Cost += getCastInstrCost( + Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); return Cost; } + + auto LT = getTypeLegalizationCost(ValTy); + // Model unknown fp compares as a libcall. + if (LT.second.getScalarType() != MVT::f64 && + LT.second.getScalarType() != MVT::f32 && + LT.second.getScalarType() != MVT::f16) + return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy, + {ValTy, ValTy}, CostKind); + + // Some comparison operators require expanding to multiple compares + or. + unsigned Factor = 1; + if (!CondTy->isVectorTy() && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ)) + Factor = 2; // fcmp with 2 selects + else if (isa<FixedVectorType>(ValTy) && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ || + VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO)) + Factor = 3; // fcmxx+fcmyy+or + else if (isa<ScalableVectorType>(ValTy) && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ)) + Factor = 3; // fcmxx+fcmyy+or + + return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first); } // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to @@ -4371,7 +4420,7 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds // providing it will not cause performance regressions. 
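To see the Factor logic above in concrete numbers, as the code reads: fcmp one on <4 x float> legalizes to a single v4f32 operation (LT.first == 1) and, being a fixed vector with FCMP_ONE, takes Factor == 3 (two compares plus an orr), so its throughput cost is 3 * 1 = 3; the same predicate on a scalar float follows the non-vector branch with Factor == 2 (a compare plus two selects), for a cost of 2 * 1 = 2.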
if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() && - ISD == ISD::SETCC && I && !CmpInst::isUnsigned(VecPred) && + Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) && TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && match(I->getOperand(0), m_And(m_Value(), m_Value()))) { if (match(I->getOperand(1), m_Zero())) @@ -6235,10 +6284,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( } } - auto ShouldSinkCondition = [](Value *Cond) -> bool { + auto ShouldSinkCondition = [](Value *Cond, + SmallVectorImpl<Use *> &Ops) -> bool { + if (!isa<IntrinsicInst>(Cond)) + return false; auto *II = dyn_cast<IntrinsicInst>(Cond); - return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && - isa<ScalableVectorType>(II->getOperand(0)->getType()); + if (II->getIntrinsicID() != Intrinsic::vector_reduce_or || + !isa<ScalableVectorType>(II->getOperand(0)->getType())) + return false; + if (isa<CmpInst>(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + return true; }; switch (I->getOpcode()) { @@ -6254,7 +6310,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( } break; case Instruction::Select: { - if (!ShouldSinkCondition(I->getOperand(0))) + if (!ShouldSinkCondition(I->getOperand(0), Ops)) return false; Ops.push_back(&I->getOperandUse(0)); @@ -6264,7 +6320,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( if (cast<BranchInst>(I)->isUnconditional()) return false; - if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition())) + if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops)) return false; Ops.push_back(&I->getOperandUse(0)); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6912caf..7a2b679 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -79,8 +79,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -421,9 +420,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { } void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -460,8 +458,8 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. 
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -471,15 +469,16 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (FulleSizeInBytes == 0) { // Handle as little-endian for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FulleSizeInBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } @@ -492,9 +491,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. if (SignedValue < 0) - Data[Offset + 3] &= ~(1 << 6); + Data[3] &= ~(1 << 6); else - Data[Offset + 3] |= (1 << 6); + Data[3] |= (1 << 6); } } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 7618a57..45ac023 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -96,8 +96,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, case AArch64::S_TPREL: case AArch64::S_TLSDESC: case AArch64::S_TLSDESC_AUTH: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -488,7 +488,8 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, // this global needs to be tagged. In addition, the linker needs to know // whether to emit a special addend when relocating `end` symbols, and this // can only be determined by the attributes of the symbol itself. 
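The applyFixup rewrites above encode a new convention: Data now points at the fixup's own offset within the fragment rather than at the fragment start, so the byte-patching loops index from zero. A minimal sketch of the little-endian loop under that assumed convention:

#include <cstdint>
#include <cstdio>

// Data is assumed to already point at Fixup.getOffset() within the fragment.
void patchLE(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}

int main() {
  uint8_t Buf[4] = {0, 0, 0, 0};
  patchLE(Buf, 0x1234, 2);
  std::printf("%02x %02x\n", Buf[0], Buf[1]); // 34 12
}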
- if (Val.getAddSym() && cast<MCSymbolELF>(Val.getAddSym())->isMemtag()) + if (Val.getAddSym() && + static_cast<const MCSymbolELF *>(Val.getAddSym())->isMemtag()) return true; if ((Val.getSpecifier() & AArch64::S_GOT) == AArch64::S_GOT) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 6257e99..14547e3 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -418,7 +418,8 @@ private: } MCSymbol *emitMappingSymbol(StringRef Name) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); return Symbol; } @@ -455,7 +456,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) { getStreamer().getAssembler().registerSymbol(*Symbol); - cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); + static_cast<MCSymbolELF *>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); } void AArch64TargetELFStreamer::finish() { @@ -541,7 +542,7 @@ void AArch64TargetELFStreamer::finish() { MCSectionELF *MemtagSec = nullptr; for (const MCSymbol &Symbol : Asm.symbols()) { - const auto &Sym = cast<MCSymbolELF>(Symbol); + auto &Sym = static_cast<const MCSymbolELF &>(Symbol); if (Sym.isMemtag()) { MemtagSec = Ctx.getELFSection(".memtag.globals.static", ELF::SHT_AARCH64_MEMTAG_GLOBALS_STATIC, 0); @@ -556,7 +557,7 @@ void AArch64TargetELFStreamer::finish() { S.switchSection(MemtagSec); const auto *Zero = MCConstantExpr::create(0, Ctx); for (const MCSymbol &Symbol : Asm.symbols()) { - const auto &Sym = cast<MCSymbolELF>(Symbol); + auto &Sym = static_cast<const MCSymbolELF &>(Symbol); if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8a0c4ac..18f3c47 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; +def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", + "HasTensorCvtLutInsts", + "true", + "Has v_perm_pk16* instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureDPPSrc1SGPR, FeatureBitOp3Insts, FeatureTanhInsts, + FeatureTensorCvtLutInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, AssemblerPredicate<(all_of FeatureTanhInsts)>; +def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">, + AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 992572f..394a143 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,18 +51,6 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; 
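The defs removed just below were GIComplexOperandMatcher hooks; the patch replaces them with GICustomOperandRenderer entries (added further down in this hunk), whose C++ callbacks read the matched immediate and append the encoded source-modifier directly. A toy sketch of that callback shape, with an assumed bit layout standing in for SISrcMods:

#include <cstdint>
#include <vector>

// Toy model of MachineInstrBuilder: an instruction as a list of immediates.
struct InstBuilder {
  std::vector<int64_t> Ops;
  void addImm(int64_t V) { Ops.push_back(V); }
};

// Assumed bit layout, standing in for SISrcMods::OP_SEL_1 / SISrcMods::NEG.
constexpr unsigned OP_SEL_1 = 1u << 2;
constexpr unsigned NEG = 1u << 0;

// Renderer shape: read the matched immediate, emit the encoded modifier.
void renderModsNeg(InstBuilder &MIB, int64_t SrcImm) {
  unsigned Mods = OP_SEL_1;
  if (SrcImm) // i1 intrinsic operand: nonzero means "treat lanes as negated"
    Mods ^= NEG;
  MIB.addImm(Mods);
}

int main() {
  InstBuilder B;
  renderModsNeg(B, 1);
  return B.Ops[0] == (OP_SEL_1 | NEG) ? 0 : 1;
}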
-def gi_vop3pmodsneg : - GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, - GIComplexPatternEquiv<VOP3PModsNeg>; - -def gi_vop3pmodsnegs : - GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">, - GIComplexPatternEquiv<VOP3PModsNegs>; - -def gi_dotiuvop3pmodsnegabs : - GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">, - GIComplexPatternEquiv<VOP3PModsNegAbs>; - def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; @@ -452,6 +440,13 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_VOP3PModsNeg : GICustomOperandRenderer<"renderVOP3PModsNeg">, + GISDNodeXFormEquiv<VOP3PModsNeg>; +def gi_VOP3PModsNegs : GICustomOperandRenderer<"renderVOP3PModsNegs">, + GISDNodeXFormEquiv<VOP3PModsNegs>; +def gi_VOP3PModsNegAbs : GICustomOperandRenderer<"renderVOP3PModsNegAbs">, + GISDNodeXFormEquiv<VOP3PModsNegAbs>; + def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, GISDNodeXFormEquiv<PrefetchLoc>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 39b4200..fb83388 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3449,63 +3449,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } -// Select neg_lo from the i1 immediate operand. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // 1 promotes packed values to signed, 0 treats them as unsigned. - assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); - - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getZExtValue(); - if (SrcSign == 1) - Mods ^= SISrcMods::NEG; - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - -// Select both neg_lo and neg_hi from the i1 immediate operand. This is -// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies -// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // 1 promotes packed values to signed, 0 treats them as unsigned. - assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); - - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getZExtValue(); - if (SrcSign == 1) - Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - -// Select neg, abs, or both neg and abs from the i16 immediate operans. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcMod = C->getZExtValue(); - switch (SrcMod) { - default: // Any other value will be silently ignored (considered as 0). 
- break; - case 1: - Mods ^= SISrcMods::NEG; - break; - case 2: - Mods ^= SISrcMods::ABS; - break; - case 3: - Mods ^= (SISrcMods::NEG | SISrcMods::ABS); - break; - } - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 983f1aa..16388e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -241,9 +241,6 @@ private: bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; - bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; - bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 31c4f62..7771f9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand); + + setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand); + setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand); + setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand); + + setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand); + setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); @@ -589,14 +601,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); - // FIXME: This is only partially true. If we have to do vector compares, any - // SGPR pair can be a condition register. If we have a uniform condition, we - // are better off doing SALU operations, where there is only one SCC. For now, - // we don't have a way of knowing during instruction selection if a condition - // will be uniform and we always use vector compares. Assume we are using - // vector compares until that is fixed. - setHasMultipleConditionRegisters(true); - setMinCmpXchgSizeInBits(32); setSupportsUnalignedAtomics(false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 39bb0ad..fd5d5b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -388,6 +388,16 @@ public: MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } + + bool hasMultipleConditionRegisters(EVT VT) const override { + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. 
For + // now, we don't have a way of knowing during instruction selection if a + // condition will be uniform and we always use vector compares. Assume we + // are using vector compares until that is fixed. + return true; + } }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index f2207ff..4fe5d00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1694,7 +1694,9 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } - case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: { Value *Src0 = II.getArgOperand(1); Value *Src1 = II.getArgOperand(3); unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b0d3b12..b7fd131 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4988,66 +4988,6 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } -// Select neg_lo from the i1 immediate operand. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // Value is in Imm operand as i1 sign extended to int64_t. - // 1(-1) promotes packed values to signed, 0 treats them as unsigned. - assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && - "expected i1 value"); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Root.getImm() == -1) - Mods ^= SISrcMods::NEG; - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - -// Select both neg_lo and neg_hi from the i1 immediate operand. This is -// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies -// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // Value is in Imm operand as i1 sign extended to int64_t. - // 1(-1) promotes packed values to signed, 0 treats them as unsigned. - assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && - "expected i1 value"); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Root.getImm() == -1) - Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - -// Select neg, abs, or both neg and abs from the i16 immediate operans. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { - - assert(Root.isImm() && "Modifier for C must be an immediate"); - - unsigned Mods = SISrcMods::OP_SEL_1; - switch (Root.getImm()) { - default: // Any other value will be silently ignored (considered as 0). 
- break; - case 1: - Mods ^= SISrcMods::NEG; - break; - case 2: - Mods ^= SISrcMods::ABS; - break; - case 3: - Mods ^= (SISrcMods::NEG | SISrcMods::ABS); - break; - } - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -7102,6 +7042,38 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (MI.getOperand(OpIdx).getImm()) + Mods ^= SISrcMods::NEG; + MIB.addImm((int64_t)Mods); +} + +void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (MI.getOperand(OpIdx).getImm()) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + MIB.addImm((int64_t)Mods); +} + +void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Val = MI.getOperand(OpIdx).getImm(); + unsigned Mods = SISrcMods::OP_SEL_1; // default: none + if (Val == 1) // neg + Mods ^= SISrcMods::NEG; + if (Val == 2) // abs + Mods ^= SISrcMods::ABS; + if (Val == 3) // neg and abs + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + MIB.addImm((int64_t)Mods); +} + void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 140e753..c9da419 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -200,13 +200,6 @@ private: selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectVOP3PModsNeg(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3PModsNegs(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3PModsNegAbs(MachineOperand &Root) const; - - InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -419,6 +412,13 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderVOP3PModsNeg(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderVOP3PModsNegs(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderVOP3PModsNegAbs(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index d443f4e..2d8f259 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -236,7 +236,7 @@ cl::opt<LoweringKind> LoweringKindLoc( "Lower via mixture of above strategies"))); template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { - llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { + llvm::sort(V, [](const auto *L, const auto *R) { return L->getName() < R->getName(); }); return {std::move(V)}; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5aa0ebf..74230a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4603,6 +4603,42 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -4762,6 +4798,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: @@ -4777,6 +4815,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: + case Intrinsic::amdgcn_perm_pk16_b4_u4: + case Intrinsic::amdgcn_perm_pk16_b6_u4: + case Intrinsic::amdgcn_perm_pk16_b8_u4: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a83caa0..d33765d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -178,6 +178,10 @@ public: ImmTyBitOp3, ImmTyMatrixAFMT, ImmTyMatrixBFMT, + ImmTyMatrixAScale, + ImmTyMatrixBScale, + ImmTyMatrixAScaleFmt, + ImmTyMatrixBScaleFmt, ImmTyMatrixAReuse, ImmTyMatrixBReuse, ImmTyScaleSel, @@ -428,6 +432,10 @@ public: bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); } bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); } + bool isMatrixAScale() const { return isImmTy(ImmTyMatrixAScale); } + bool isMatrixBScale() const { return isImmTy(ImmTyMatrixBScale); } + bool isMatrixAScaleFmt() const { return isImmTy(ImmTyMatrixAScaleFmt); } + bool isMatrixBScaleFmt() const { return isImmTy(ImmTyMatrixBScaleFmt); } bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } @@ -1183,6 +1191,10 @@ public: case ImmTyBitOp3: OS << "BitOp3"; break; case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break; case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; + case ImmTyMatrixAScale: OS << "ImmTyMatrixAScale"; break; + case ImmTyMatrixBScale: OS << "ImmTyMatrixBScale"; break; + case ImmTyMatrixAScaleFmt: OS << "ImmTyMatrixAScaleFmt"; break; + case ImmTyMatrixBScaleFmt: OS << "ImmTyMatrixBScaleFmt"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; case ImmTyScaleSel: OS << "ScaleSel" ; break; @@ -1728,6 +1740,14 @@ public: AMDGPUOperand::ImmTy Type); ParseStatus parseMatrixAFMT(OperandVector &Operands); ParseStatus parseMatrixBFMT(OperandVector &Operands); + ParseStatus tryParseMatrixScale(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScale(OperandVector &Operands); + ParseStatus parseMatrixBScale(OperandVector &Operands); + ParseStatus tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScaleFmt(OperandVector &Operands); + ParseStatus parseMatrixBScaleFmt(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -7356,6 +7376,42 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { AMDGPUOperand::ImmTyMatrixBFMT); } +ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_a_scale", + AMDGPUOperand::ImmTyMatrixAScale); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_b_scale", + AMDGPUOperand::ImmTyMatrixBScale); +} + +ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, + {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}, + Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_a_scale_fmt", + 
AMDGPUOperand::ImmTyMatrixAScaleFmt); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_b_scale_fmt", + AMDGPUOperand::ImmTyMatrixBScaleFmt); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9489,6 +9545,34 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand::ImmTyMatrixBFMT, 0); } + int MatrixAScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale); + if (MatrixAScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScale, 0); + } + + int MatrixBScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale); + if (MatrixBScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScale, 0); + } + + int MatrixAScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale_fmt); + if (MatrixAScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScaleFmt, 0); + } + + int MatrixBScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale_fmt); + if (MatrixBScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScaleFmt, 0); + } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyMatrixAReuse, 0); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ffe6b06..fef0d7e 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -598,6 +598,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings + if (isGFX1250() && Bytes.size() >= 16) { + DecoderUInt128 DecW = eat16Bytes(Bytes); + if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS)) + break; + Bytes = Bytes_.slice(0, MaxInstBytesNum); + } + if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6fe3abc..c84ba1a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -236,6 +236,7 @@ protected: bool Has64BitLiterals = false; bool HasBitOp3Insts = false; bool HasTanhInsts = false; + bool HasTensorCvtLutInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1411,6 +1412,8 @@ public: bool hasTanhInsts() const { return HasTanhInsts; } + bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 86d56855..4e4660c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -33,8 +33,7 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {} void applyFixup(const MCFragment &, const MCFixup &, const 
MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -129,9 +128,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -148,13 +146,13 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } std::optional<MCFixupKind> diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 42c4d8b..ee8683a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1393,6 +1393,75 @@ void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo, printMatrixFMT(MI, OpNo, STI, O, 'b'); } +void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 1; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW0: + O << "MATRIX_SCALE_ROW0"; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW1: + O << "MATRIX_SCALE_ROW1"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'b'); +} + +void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 3; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale_fmt:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8: + O << "MATRIX_SCALE_FMT_E8"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3: + O << "MATRIX_SCALE_FMT_E5M3"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3: + O << "MATRIX_SCALE_FMT_E4M3"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'b'); +} + 
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f6739b14..be32061c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -140,6 +140,19 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printMatrixBFMT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, char AorB); + void printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, + char AorB); + void printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index ffdac8b..fa0c95f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -75,8 +75,9 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { if (STI->hasFeature(AMDGPU::FeatureNSAEncoding)) return 20; - // VOP3PX encoding. - if (STI->hasFeature(AMDGPU::FeatureGFX950Insts)) + // VOP3PX/VOP3PX2 encoding. + if (STI->hasFeature(AMDGPU::FeatureGFX950Insts) || + STI->hasFeature(AMDGPU::FeatureGFX1250Insts)) return 16; // 64-bit instruction with 32-bit literal. 
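Review note: the printMatrixScale/printMatrixScaleFmt callbacks above print nothing when the immediate is the hardware default (0), so disassembly stays compact and assembly round-trips through the matching tryParseMatrixScale*/parseStringOrIntWithPrefix parsers. A minimal standalone C++ sketch of that mapping, not buildable against LLVM (printf stands in for raw_ostream; enum values copied from the WMMA::MatrixScaleFmt additions to SIDefines.h later in this diff):

#include <cstdio>

// Sketch of AMDGPUInstPrinter::printMatrixScaleFmt's immediate-to-mnemonic
// mapping. Values mirror the WMMA::MatrixScaleFmt enum added in SIDefines.h.
enum MatrixScaleFmt : unsigned {
  MATRIX_SCALE_FMT_E8 = 0, // hardware default; the operand is omitted entirely
  MATRIX_SCALE_FMT_E5M3 = 1,
  MATRIX_SCALE_FMT_E4M3 = 2
};

static void printScaleFmt(unsigned Imm, char AorB) {
  Imm &= 3; // the encoded field is two bits wide
  if (Imm == 0)
    return; // default value: print nothing, so the compact form round-trips
  printf(" matrix_%c_scale_fmt:", AorB);
  switch (Imm) {
  case MATRIX_SCALE_FMT_E5M3:
    printf("MATRIX_SCALE_FMT_E5M3");
    break;
  case MATRIX_SCALE_FMT_E4M3:
    printf("MATRIX_SCALE_FMT_E4M3");
    break;
  default:
    printf("%u", Imm); // unnamed encoding (3) falls back to the raw value
    break;
  }
}

int main() {
  for (unsigned Imm = 0; Imm != 4; ++Imm) {
    printScaleFmt(Imm, 'a');
    printf("|\n"); // '|' marks where the printed operand text ends
  }
  return 0;
}

Because the parser accepts the same symbolic names, an immediate of 0 round-trips simply as the absence of the operand.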
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 43ca548..68302f0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -872,14 +872,14 @@ void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) { void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) { - MCSymbolELF *Symbol = cast<MCSymbolELF>( + auto *Symbol = static_cast<MCSymbolELF *>( getStreamer().getContext().getOrCreateSymbol(SymbolName)); Symbol->setType(Type); } void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) { - MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol); + auto *SymbolELF = static_cast<MCSymbolELF *>(Symbol); SymbolELF->setType(ELF::STT_OBJECT); if (!SymbolELF->isBindingSet()) @@ -974,9 +974,9 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); - MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( - Context.getOrCreateSymbol(Twine(KernelName))); - MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>( + auto *KernelCodeSymbol = + static_cast<MCSymbolELF *>(Context.getOrCreateSymbol(Twine(KernelName))); + auto *KernelDescriptorSymbol = static_cast<MCSymbolELF *>( Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); // Copy kernel descriptor symbol's binding, other and visibility from the diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index c564145..deadb7a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1018,6 +1018,17 @@ enum MatrixFMT : unsigned { MATRIX_FMT_BF6 = 3, MATRIX_FMT_FP4 = 4 }; + +enum MatrixScale : unsigned { + MATRIX_SCALE_ROW0 = 0, + MATRIX_SCALE_ROW1 = 1, +}; + +enum MatrixScaleFmt : unsigned { + MATRIX_SCALE_FMT_E8 = 0, + MATRIX_SCALE_FMT_E5M3 = 1, + MATRIX_SCALE_FMT_E4M3 = 2 +}; } // namespace WMMA namespace VOP3PEncoding { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e934152..0c653b1 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1169,11 +1169,18 @@ void SIFoldOperandsImpl::foldOperand( // Grab the use operands first SmallVector<MachineOperand *, 4> UsesToProcess( llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg))); - for (auto *RSUse : UsesToProcess) { + for (unsigned I = 0; I != UsesToProcess.size(); ++I) { + MachineOperand *RSUse = UsesToProcess[I]; MachineInstr *RSUseMI = RSUse->getParent(); unsigned OpNo = RSUseMI->getOperandNo(RSUse); if (SplatRC) { + if (RSUseMI->isCopy()) { + Register DstReg = RSUseMI->getOperand(0).getReg(); + append_range(UsesToProcess, + make_pointer_range(MRI->use_nodbg_operands(DstReg))); + continue; + } if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) { FoldableDef SplatDef(SplatVal, SplatRC); appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index a3e20ba..c552f1a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -908,6 +908,32 @@ def SupportedRoundMode : TImmLeaf<i32, [{ Imm == (int)RoundingMode::TowardNegative; }]>; +def VOP3PModsNeg : SDNodeXForm<timm, [{ + unsigned Mods = SISrcMods::OP_SEL_1; + if (N->getZExtValue()) + Mods ^= 
SISrcMods::NEG; + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + +def VOP3PModsNegs : SDNodeXForm<timm, [{ + unsigned Mods = SISrcMods::OP_SEL_1; + if (N->getZExtValue()) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + +def VOP3PModsNegAbs : SDNodeXForm<timm, [{ + unsigned Val = N->getZExtValue(); + unsigned Mods = SISrcMods::OP_SEL_1; // default: none + if (Val == 1) // neg + Mods ^= SISrcMods::NEG; + if (Val == 2) // abs + Mods ^= SISrcMods::ABS; + if (Val == 3) // neg and abs + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ uint64_t Imm = N->getZExtValue(); unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; @@ -1310,6 +1336,12 @@ def bitop3_0 : DefaultOperand<BitOp3, 0>; def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">; def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">; +def MatrixAScale : CustomOperand<i32, 1, "MatrixAScale">; +def MatrixBScale : CustomOperand<i32, 1, "MatrixBScale">; + +def MatrixAScaleFmt : CustomOperand<i32, 1, "MatrixAScaleFmt">; +def MatrixBScaleFmt : CustomOperand<i32, 1, "MatrixBScaleFmt">; + def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; @@ -1647,9 +1679,6 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; -def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">; -def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern? -def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">; @@ -1774,6 +1803,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, @@ -1924,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2678,6 +2709,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasNeg = HasModifiers; field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; + field bit HasMatrixScale = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); @@ -2935,6 +2968,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; +def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>; +def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>; +def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; 
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>; def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>; @@ -2948,6 +2984,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; +def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>; +def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>; def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; @@ -2955,11 +2993,26 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>; def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>; def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>; def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>; +def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>; +def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>; +def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>; +def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>; +def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>; +def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>; def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>; +def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>; +def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>; +def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>; +def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>; +def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>; +def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>; +def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>; +def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>; +def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 54fa192..bd5dfa9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3543,14 +3543,21 @@ def : GCNPat < (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; -} def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))), (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) >; +} // End True16Predicate = ... } // End foreach Ty = ... 
-} +} // End AddedComplexity = 1 + +let True16Predicate = UseRealTrue16Insts in +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, + (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), hi16) +>; let SubtargetPredicate = HasVOP3PInsts in { foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in @@ -3599,7 +3606,11 @@ def : GCNPat < >; def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))), - (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16) + (REG_SEQUENCE VGPR_32, $src0, lo16, (Ty (IMPLICIT_DEF)), hi16) +>; +def : GCNPat < + (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_16:$src1))), + (REG_SEQUENCE VGPR_32, (Ty (IMPLICIT_DEF)), lo16, (Ty VGPR_16:$src1), hi16) >; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 36d1a3b..08d07c9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index f621f85..b128207 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -107,18 +107,6 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : VOP_DPP_Pseudo <OpName, P, pattern> { } -class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies { - list<dag> ret = - !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers))))], - !if(P.HasOMod, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))] - ) - ); -} - multiclass VOP1Inst <string opName, VOPProfile P, SDPatternOperator node = null_frag, int VOPDOp = -1> { // We only want to set this on the basic, non-SDWA or DPP forms. 
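Review note: the VOP3PModsNeg/VOP3PModsNegs/VOP3PModsNegAbs SDNodeXForms in the SIInstrInfo.td hunk earlier replace the removed SelectVOP3PModsNeg* ComplexPatterns, and the renderVOP3PModsNeg* callbacks added to AMDGPUInstructionSelector.cpp implement the same translation on the GlobalISel side. A self-contained sketch of that shared bit math; the SISrcMods values are assumptions mirroring SIDefines.h (NEG = 1, ABS = 2, NEG_HI aliasing ABS, OP_SEL_1 = 8):

#include <cassert>
#include <cstdint>

namespace SISrcMods {
enum : unsigned {
  NEG = 1u << 0,   // negate low packed half
  ABS = 1u << 1,   // absolute value
  NEG_HI = ABS,    // negate high packed half (aliases ABS in VOP3P)
  OP_SEL_1 = 1u << 3
};
} // namespace SISrcMods

// imm != 0 -> fneg of the low half only (dot-iu style operands).
unsigned vop3pModsNeg(uint64_t Imm) {
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (Imm)
    Mods ^= SISrcMods::NEG;
  return Mods;
}

// imm != 0 -> fneg of both packed halves (f16/bf16 A/B operands).
unsigned vop3pModsNegs(uint64_t Imm) {
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (Imm)
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
  return Mods;
}

// 0 = none, 1 = neg, 2 = abs, 3 = neg|abs (matrix C operand).
unsigned vop3pModsNegAbs(uint64_t Val) {
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (Val == 1)
    Mods ^= SISrcMods::NEG;
  if (Val == 2)
    Mods ^= SISrcMods::ABS;
  if (Val == 3)
    Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
  return Mods;
}

int main() {
  assert(vop3pModsNeg(0) == 8 && vop3pModsNeg(1) == 9);
  assert(vop3pModsNegs(1) == 11);
  assert(vop3pModsNegAbs(2) == 10 && vop3pModsNegAbs(3) == 11);
  return 0;
}

Since the starting mask only has OP_SEL_1 set, the XORs behave exactly like ORs here; moving this logic out of ComplexPatterns lets the VOP3PInstructions.td patterns below match plain timm operands and apply the transform on the output side instead.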
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 19ce7f5..f4b6af6 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no } } +let HasExtVOP3DPP = 0, HasModifiers = 0 in { +def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>; +def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>; +def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>; +} + let Src0RC64 = VSrc_NoInline_v2f16 in { def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; @@ -1771,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>; + defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>; + defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>; + defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>; + defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>; + defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>; } // End Constraints = "@earlyclobber $vdst" defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; @@ -1778,6 +1790,44 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; } // End ReadsModeReg = 0 + let Constraints = "@earlyclobber $vdst" in { + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>; + defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>; + defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>; + defm V_CVT_SCALEF32_PK8_BF8_F32 : 
VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>; + defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>; + defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>; + defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>; + defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>; + defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>; + + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", 
VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>; + defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>; + } // End Constraints = "@earlyclobber $vdst" + let True16Predicate = UseRealTrue16Insts in { def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; @@ -1788,6 +1838,12 @@ let SubtargetPredicate = isGFX1250Plus in { } } // End SubtargetPredicate = isGFX1250Plus +let SubtargetPredicate = HasTensorCvtLutInsts in { + defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>; + defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>; + defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>; +} // End SubtargetPredicate = HasTensorCvtLutInsts + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -2186,6 +2242,9 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; +defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; +defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; @@ -2198,6 +2257,42 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; +defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>; +defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>; +defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>; +defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>; +defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>; +defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>; +defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; +defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; +defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; +defm 
V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>; +defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>; +defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>; +defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>; +defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>; +defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>; +defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>; +defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>; +defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>; +defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>; +defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>; +defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>; +defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>; +defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>; +defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>; +defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>; +defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 95fcd4a..9264935 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -557,11 +557,11 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> { null_frag, 1>; // Dot-iu instructions consider input as signed if imod neg bits are set. Thus // Dot-iu Intrinsics have extra operands and require separate codegen pattern. 
- def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0, - (VOP3PModsNeg i32:$src1_mods), i32:$src1, + def : GCNPat < (intrinsic_node timm:$src0_mods, i32:$src0, + timm:$src1_mods, i32:$src1, i32:$src2, (i1 timm:$clamp)), - (!cast<Instruction>(NAME) $src0_mods, i32:$src0, - $src1_mods, i32:$src1, + (!cast<Instruction>(NAME) (VOP3PModsNeg $src0_mods), i32:$src0, + (VOP3PModsNeg $src1_mods), i32:$src1, (i32 8), i32:$src2, i1:$clamp) >; } @@ -1302,11 +1302,11 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : GCNPat < (P.DstVT (node - (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), - (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + timm:$src0_modifiers, (P.Src0VT P.Src0VT:$src0), + timm:$src1_modifiers, (P.Src1VT P.Src1VT:$src1), (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) )), - (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) + (P.DstVT (Inst (VOP3PModsNeg $src0_modifiers), P.Src0VT:$src0, (VOP3PModsNeg $src1_modifiers), P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) >; class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { @@ -1407,9 +1407,9 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, - bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0, - bit _IsF4 = 0> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, + bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B @@ -1417,6 +1417,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, int IndexType = _IndexType; let HasMatrixFMT = _HasMatrixFMT; + let HasMatrixScale = _HasMatrixScale; + bit Scale16 = _Scale16; let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; @@ -1455,6 +1457,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); + ValueType ScaleTy = !if(Scale16, i64, i32); // For f16 and bf16 matrices A and B, each element can be modified by // fneg(neg_lo,neg_hi = 1). 
For f32 and f64, neg_lo[0:1] is allowed, but @@ -1516,6 +1519,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt), (ins)); + dag MatrixScaleSrc = !if(HasMatrixScale, + !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1), + (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)), + (ins)); + dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), + (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), @@ -1529,7 +1539,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixFMT, MatrixReuse, Clamp, Neg); + MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg); // asm @@ -1538,57 +1548,59 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 16) : "$index_key_16bit", !eq(IndexType, 32) : "$index_key_32bit"); string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", ""); + string MatrixScaleSrcAsm = !if(HasMatrixScale, ", $scale_src0, $scale_src1", ""); + string MatrixScaleAsm = !if(HasMatrixScale, "$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt", ""); string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixScaleSrcAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); - dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0), IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), IsAB_BF16_IMod0 : (ins Src0VT:$src0), - IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsIU : (ins timm:$src0_modifiers, Src0VT:$src0), HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0), IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), - IsIU : (ins 
i32:$src0_modifiers, Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), + dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1), IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), IsAB_BF16_IMod0 : (ins Src1VT:$src1), - IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsIU : (ins timm:$src1_modifiers, Src1VT:$src1), HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src1_modifiers), Src1VT:$src1), IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), - IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); bit IsIUXF32 = !or(IsIU, IsXF32); - dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2), IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_BF16_IMod0 : (ins Src2VT:$src2), IsIUXF32 : (ins Src2VT:$src2), IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2), IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), @@ -1604,22 +1616,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); - dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); - dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0, + timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1), + (ins)); dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag 
MatrixScaleOutSrcPat = !if(HasMatrixScale, (ins ScaleTy:$scale_src0, ScaleTy:$scale_src1), (ins)); + dag MatrixScaleOutModPat = !if(HasMatrixScale, (ins i32:$matrix_a_scale, i32:$matrix_b_scale, i32:$matrix_a_scale_fmt, i32:$matrix_b_scale_fmt), (ins)); dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat, + MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. - dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, + MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); } def WMMAInstInfoTable : GenericTable { @@ -1645,11 +1664,15 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; + let FixedSize = WMMAProfile.HasMatrixScale; + let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; + let FixedSize = WMMAProfile.HasMatrixScale; + let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); } } @@ -1728,39 +1751,53 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 
1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>; - -multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> { - def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; -} - -defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, 
v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>; + +multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { + def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>; +defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>; +defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>; + +class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> { + let HasMatrixScale = 1; + let HasMatrixReuse = 1; + let HasNeg = 0; + let Src0RC64 = RC; + let Src1RC64 = RC; + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt, + MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse); + let AsmVOP3P = " $src0, $src1$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt$matrix_a_reuse$matrix_b_reuse"; +} multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> { foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { @@ -1813,9 +1850,12 @@ defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64 defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">; - +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">; } // End is_wmma_xdl = 1. 
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>; } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -1973,6 +2013,8 @@ let SubtargetPredicate = isGFX125xOnly in { foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>; + defm : WMMAPat<"V_WMMA_SCALE_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE_" # I # "_w32")>; + defm : WMMAPat<"V_WMMA_SCALE16_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE16_" # I # "_w32")>; } def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; @@ -2105,6 +2147,73 @@ multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> { } } +class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VOP3Pe_Base { + bits<9> scale_src0; + bits<9> scale_src1; + + // Inst{7-0} = unused + let Inst{10-8} = {0, matrix_b_scale_fmt{1-0}}; // neg_hi + let Inst{11} = matrix_a_scale{0}; // scale_op_sel(0) + let Inst{12} = 0; // scale_op_sel(1) + let Inst{13} = matrix_a_reuse; // scale_op_sel(2) + let Inst{14} = matrix_b_reuse; // scale_op_sel_hi(2) + let Inst{15} = 0; // scale_clamp + let Inst{31-24} = 0xcc; // Encoding + let Inst{23-16} = LdScaleOp; + let Inst{40-32} = scale_src0; + let Inst{49-41} = scale_src1; + let Inst{58-50} = 0; // scale src2 + let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) + let Inst{60} = 0; // scale_op_sel_hi(1) + let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) + + // The high half of the encoding is the unscaled wmma op. 
+ let Inst{71-64} = vdst; + + let Inst{72} = !if(P.NegHi01, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{73} = !if(P.NegHi01, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{74} = !if(P.NegHi2, src2_modifiers{1}, 0); // neg_hi src2 + + let Inst{77-75} = !if(P.HasMatrixFMT, matrix_a_fmt{2-0}, 0); // op_sel + + let Inst{78,124,123} = !if(P.HasMatrixFMT, matrix_b_fmt{2-0}, 7); // op_sel_hi + let Inst{79} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{87-80} = op; + let Inst{95-88} = 0xcc; //encoding + let Inst{104-96} = !if(P.HasSrc0, src0, 0); + let Inst{113-105} = !if(P.HasSrc1, src1, 0); + let Inst{122-114} = !if(P.HasSrc2, src2, 0); + + // neg_lo + let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0); + let Inst{126} = !if(P.NegLo01, src1_modifiers{0}, 0); + let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0); +} + +multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, + DecoderNamespace = "GFX1250" in { + def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>, + VOP3PX2e <op, LdScaleOp, WMMAP>, + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> { + let AsmString = asmName # PS.AsmOperands; + } + } +} + +multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> { + defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; + foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { + let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
+ defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + } + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -2180,6 +2289,8 @@ defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8B defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">; +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; @@ -2283,6 +2394,9 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; + let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f027ab0..3cad5a1 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -475,17 +475,24 @@ class VOP3Pe_Base { bits<1> index_key_32bit; bits<3> matrix_a_fmt; bits<3> matrix_b_fmt; + bits<1> matrix_a_scale; + bits<1> matrix_b_scale; + bits<2> matrix_a_scale_fmt; + bits<2> matrix_b_scale_fmt; bits<1> matrix_a_reuse; bits<1> matrix_b_reuse; } class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{7-0} = !if(P.HasDst, vdst, 0); - let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 - let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{0}, 0)); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{1}, 0)); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, + !if(P.HasMatrixScale, matrix_a_scale{0}, 0)); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) @@ -500,10 +507,17 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1) - let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) - let Inst{62} = !if(P.HasSrc1Mods, 
src1_modifiers{0}, 0); // neg (lo) + let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixScale : matrix_b_scale{0}, + 1: ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, + !if(P.HasMatrixScale, 0, + !if(P.IsDOT, 1, ?))); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{0}, 0)); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{1}, 0)); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9366256..7f8b446 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -669,13 +669,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, // Integer division functions // RTABI chapter 4.3.1 - { RTLIB::SDIV_I8, RTLIB::__aeabi_idiv__i8 }, - { RTLIB::SDIV_I16, RTLIB::__aeabi_idiv__i16 }, - { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv__i32}, + { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv }, { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod }, - { RTLIB::UDIV_I8, RTLIB::__aeabi_uidiv__i8 }, - { RTLIB::UDIV_I16, RTLIB::__aeabi_uidiv__i16 }, - { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv__i32 }, + { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv }, { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod }, }; // clang-format on diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index e8d0d35..fedf9e2 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -121,10 +121,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return std::make_unique<ARMElfTargetObjectFile>(); } -static std::string computeDataLayout(const Triple &TT, StringRef CPU, +static std::string computeDataLayout(const Triple &TT, const TargetOptions &Options, bool isLittle) { - auto ABI = ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName); + auto ABI = ARM::computeTargetABI(TT, Options.MCOptions.ABIName); std::string Ret; if (isLittle) @@ -202,11 +202,10 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool isLittle) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options, isLittle), - TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), + : CodeGenTargetMachineImpl(T, computeDataLayout(TT, Options, isLittle), TT, + CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TargetABI(ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName)), + TargetABI(ARM::computeTargetABI(TT, Options.MCOptions.ABIName)), TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { // Default to triple-appropriate float ABI diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index dfa3de3c..cc1c79b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -296,9 +296,9 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym, unsigned FixupKind) { // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. 
- if (!Sym || !Sym->isELF()) + if (!Sym || !Asm.getContext().isELF()) return false; - unsigned Type = cast<MCSymbolELF>(Sym)->getType(); + unsigned Type = static_cast<const MCSymbolELF *>(Sym)->getType(); if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) { if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch)) return true; @@ -1108,9 +1108,8 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F, } void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -1124,14 +1123,15 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. const unsigned NumBytes = getFixupKindNumBytes(Kind); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; if (Endian == llvm::endianness::big) { FullSizeBytes = getFixupKindContainerSizeBytes(Kind); - assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FullSizeBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } @@ -1141,7 +1141,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (FullSizeBytes - 1 - i); - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 07d2cf7..2844232 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,8 +40,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 50e9ca1..d914f6e 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -97,8 +97,8 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, case ARM::S_TLSLDM_FDPIC: case ARM::S_TLSLDO: case ARM::S_TPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6dfe846..0796746 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -614,7 +614,7 @@ public: if (!IsThumb) return Val; - 
unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+    unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType();
     if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) &&
         Symbol->isDefined())
       getAssembler().setIsThumbFunc(Symbol);
@@ -679,7 +679,8 @@ private:
   }
   void EmitMappingSymbol(StringRef Name) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+    auto *Symbol =
+        static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
     emitLabel(Symbol);
     Symbol->setType(ELF::STT_NOTYPE);
@@ -687,7 +688,8 @@ private:
   }
   void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+    auto *Symbol =
+        static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
     emitLabelAtPos(Symbol, SMLoc(), F, Offset);
     Symbol->setType(ELF::STT_NOTYPE);
     Symbol->setBinding(ELF::STB_LOCAL);
@@ -1088,7 +1090,7 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
     return;
   Streamer.getAssembler().registerSymbol(*Symbol);
-  unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+  unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType();
   if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
     emitThumbFunc(Symbol);
 }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 354de8f..8ee3a2d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -505,7 +505,7 @@ public:
     // Remember that the function is a thumb function. Fixup and relocation
     // values will need to be adjusted.
     getStreamer().getAssembler().setIsThumbFunc(Symbol);
-    cast<MCSymbolMachO>(Symbol)->setThumbFunc();
+    static_cast<MCSymbolMachO *>(Symbol)->setThumbFunc();
   }
 };
 } // namespace
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 38444f9..05a7d03 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -368,9 +368,8 @@ AVRAsmBackend::createObjectTargetWriter() const {
 }
 void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
-                               const MCValue &Target,
-                               MutableArrayRef<char> Data, uint64_t Value,
-                               bool IsResolved) {
+                               const MCValue &Target, uint8_t *Data,
+                               uint64_t Value, bool IsResolved) {
   // AVR sets the fixup value to bypass the assembly time overflow with a
   // relocation.
   if (IsResolved) {
@@ -397,14 +396,14 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
   // Shift the value into position.
   Value <<= Info.TargetOffset;
-  unsigned Offset = Fixup.getOffset();
-  assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+  assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+         "Invalid fixup offset!");
   // For each byte of the fragment that the fixup touches, mask in the
   // bits from the fixup value.
for (unsigned i = 0; i < NumBytes; ++i) { uint8_t mask = (((Value >> (i * 8)) & 0xff)); - Data[Offset + i] |= mask; + Data[i] |= mask; } } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index 68c839e..9633669 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -38,8 +38,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index dda8753..53933f9 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,7 @@ public: ~BPFAsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -66,35 +65,32 @@ bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (Fixup.getKind() == FK_SecRel_8) { // The Value is 0 for global variables, and the in-section offset // for static variables. Write to the immediate field of the inst. assert(Value <= UINT32_MAX); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], - static_cast<uint32_t>(Value), + support::endian::write<uint32_t>(Data + 4, static_cast<uint32_t>(Value), Endian); } else if (Fixup.getKind() == FK_Data_4 && !Fixup.isPCRel()) { - support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint32_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_8) { - support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint64_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_4 && Fixup.isPCRel()) { Value = (uint32_t)((Value - 8) / 8); if (Endian == llvm::endianness::little) { - Data[Fixup.getOffset() + 1] = 0x10; - support::endian::write32le(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x10; + support::endian::write32le(Data + 4, Value); } else { - Data[Fixup.getOffset() + 1] = 0x1; - support::endian::write32be(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x1; + support::endian::write32be(Data + 4, Value); } } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) { // The input Value represents the number of bytes. 
Value = (uint32_t)((Value - 8) / 8); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value, - Endian); + support::endian::write<uint32_t>(Data + 4, Value, Endian); } else { assert(Fixup.getKind() == FK_Data_2 && Fixup.isPCRel()); @@ -103,8 +99,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, report_fatal_error("Branch target out of insn range"); Value = (uint16_t)((Value - 8) / 8); - support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value, - Endian); + support::endian::write<uint16_t>(Data + 2, Value, Endian); } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index 1bd82fad..6964998 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -197,9 +197,8 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F, } void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -217,10 +216,10 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. @@ -228,14 +227,14 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, bool IsInstFixup = (Kind >= FirstTargetFixupKind); if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) { - Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff); - Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff); - Data[Offset + 2] |= uint8_t(Value & 0xff); - Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff); + Data[0] |= uint8_t((Value >> 16) & 0xff); + Data[1] |= uint8_t((Value >> 24) & 0xff); + Data[2] |= uint8_t(Value & 0xff); + Data[3] |= uint8_t((Value >> 8) & 0xff); } else { for (unsigned I = 0; I != NumBytes; I++) { unsigned Idx = IsLittleEndian ? 
I : (NumBytes - 1 - I); - Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff); } } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h index 1c8516f..5d8826a 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h @@ -25,8 +25,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index d042d26..4667975f 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -48,8 +48,8 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup, case CSKY::S_TLSGD: case CSKY::S_TLSLDM: case CSKY::S_TLSLDO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp index 346b123..397cf16 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp @@ -169,7 +169,8 @@ void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) { State = (Name == "$t" ? EMS_Text : EMS_Data); - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index b6e8ce7..26a113d 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -103,7 +103,7 @@ GlobalVariable *DXContainerGlobals::computeShaderHash(Module &M) { dxbc::ShaderHash HashData = {0, {0}}; // The Hash's IncludesSource flag gets set whenever the hashed shader includes // debug information. 
-  if (M.debug_compile_units_begin() != M.debug_compile_units_end())
+  if (!M.debug_compile_units().empty())
     HashData.Flags = static_cast<uint32_t>(dxbc::HashFlags::IncludesSource);
   memcpy(reinterpret_cast<void *>(&HashData.Digest), Result.data(), 16);
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 5323be6..9a14c01 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -78,8 +78,7 @@ public:
   ~DXILAsmBackend() override = default;
   void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
-                  MutableArrayRef<char> Data, uint64_t Value,
-                  bool IsResolved) override {}
+                  uint8_t *Data, uint64_t Value, bool IsResolved) override {}
   std::unique_ptr<MCObjectTargetWriter>
   createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 102f1c6..14b6bb3 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -330,7 +330,7 @@ bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
   if (!GepI->getType()->isPointerTy())
     return false;
   // No GEPs without any indices. (Is this possible?)
-  if (GepI->idx_begin() == GepI->idx_end())
+  if (GepI->indices().empty())
     return false;
   return true;
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 52fa678..613048b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1987,7 +1987,7 @@ SmallVector<uint32_t, 8> HvxSelector::getPerfectCompletions(ShuffleMask SM,
   // times). In such cases it will be impossible to complete this to a
   // perfect shuffle.
   SmallVector<uint32_t, 8> Sorted(Worklist);
-  llvm::sort(Sorted.begin(), Sorted.end());
+  llvm::sort(Sorted);
   for (unsigned I = 0, E = Sorted.size(); I != E;) {
     unsigned P = Sorted[I], Count = 1;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index d5b7a75..1a0f1ab 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -402,8 +402,7 @@ public:
   }
   void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
-                  MutableArrayRef<char> Data, uint64_t FixupValue,
-                  bool IsResolved) override;
+                  uint8_t *Data, uint64_t FixupValue, bool IsResolved) override;
   bool isInstRelaxable(MCInst const &HMI) const {
     const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI);
@@ -649,8 +648,7 @@ public:
 } // namespace
 void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
-                                   const MCValue &Target,
-                                   MutableArrayRef<char> Data,
+                                   const MCValue &Target, uint8_t *InstAddr,
                                    uint64_t FixupValue, bool IsResolved) {
   if (IsResolved && shouldForceRelocation(Fixup))
     IsResolved = false;
@@ -667,10 +665,9 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
   // LLVM gives us an encoded value; we have to convert it back
   // to a real offset before we can use it.
- uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); - char *InstAddr = Data.data() + Offset; + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); Value = adjustFixupValue(Kind, FixupValue); if (!Value) @@ -757,8 +754,8 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8)); dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; - dbgs().write_hex(FixupValue) - << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x"; + dbgs().write_hex(FixupValue) << ": Offset=" << Fixup.getOffset() + << ": Size=" << F.getSize() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); // For each byte of the fragment that the fixup touches, mask in the diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index 9752f3a..af97ea2 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -50,8 +50,8 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup, case HexagonMCExpr::VK_IE: case HexagonMCExpr::VK_IE_GOT: case HexagonMCExpr::VK_TPREL: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 13ecc23..039ef4f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -96,7 +96,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, getAssembler().registerSymbol(*Symbol); StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"}; - auto ELFSymbol = cast<MCSymbolELF>(Symbol); + auto ELFSymbol = static_cast<MCSymbolELF *>(Symbol); if (!ELFSymbol->isBindingSet()) ELFSymbol->setBinding(ELF::STB_GLOBAL); @@ -143,7 +143,7 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, Align ByteAlignment, unsigned AccessSize) { getAssembler().registerSymbol(*Symbol); - auto ELFSymbol = cast<MCSymbolELF>(Symbol); + auto ELFSymbol = static_cast<const MCSymbolELF *>(Symbol); ELFSymbol->setBinding(ELF::STB_LOCAL); ELFSymbol->setExternal(false); HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 83d1697..3112dea 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -48,8 +48,7 @@ public: : MCAsmBackend(llvm::endianness::big), OSType(OST) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -72,9 +71,8 @@ bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void 
LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -85,7 +83,6 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Where in the object and where the number of bytes that need // fixing up - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; unsigned FullSize = 4; @@ -95,8 +92,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Load instruction and apply value for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx])) - << (i * 8); + CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Idx])) << (i * 8); } uint64_t Mask = @@ -106,7 +102,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write out the fixed up bytes back to the code/data bits. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); + Data[Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 858f3d0..fda9d97 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -131,19 +131,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, - MutableArrayRef<char> Data, uint64_t Value) { +static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, + uint64_t Value) { unsigned I; - for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) + for (I = 0; Value; ++I, Value >>= 7) Data[I] |= uint8_t(Value & 0x7f); if (Value) Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); } void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); @@ -166,14 +165,14 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned I = 0; I != NumBytes; ++I) { - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } } @@ -274,15 +273,14 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) return false; - bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); - assert(IsAbsolute && "CFA with invalid expression"); - (void)IsAbsolute; + [[maybe_unused]] bool IsAbsolute = + AddrDelta.evaluateKnownAbsolute(Value, *Asm); + assert(IsAbsolute); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -293,33 +291,23 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> FK; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. + unsigned PCBytes; if (Value > 60000) { unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - FK = getRelocPairForSize(PtrSize == 4 ? 32 : 64); + PCBytes = PtrSize; + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PtrSize + 1) + << uint8_t(dwarf::DW_LNE_set_address); OS.write_zeros(PtrSize); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - FK = getRelocPairForSize(16); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -330,7 +318,8 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 3d929fc..1f13601 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -42,8 +42,7 @@ public: uint64_t &FixedValue, bool IsResolved); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index fb741af..7e021e4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -61,8 +61,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(const 
MCFixup &Fixup, case ELF::R_LARCH_TLS_LD_PCREL20_S2: case ELF::R_LARCH_TLS_GD_PCREL20_S2: case ELF::R_LARCH_TLS_DESC_PCREL20_S2: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index 7ef705d..fe83dc6 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -53,8 +53,7 @@ public: .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -78,9 +77,8 @@ public: } // end anonymous namespace void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -95,8 +93,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write in Big Endian for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = - uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); + Data[i] = uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); } /// cc—Carry clear GE—Greater than or equal diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp index ca94a47..d070409 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp @@ -70,8 +70,8 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup, case M68k::S_TLSLD: case M68k::S_TLSLDM: case M68k::S_TPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index b513503..d892b3a 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -36,8 +36,7 @@ public: ~MSP430AsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { @@ -105,9 +104,8 @@ uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup, } void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); Value = adjustFixupValue(Fixup, Value, getContext()); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -117,15 +115,14 
@@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 259b71b..7b2ee83 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2948,8 +2948,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool IsPtr64 = ABI.ArePtrs64bit(); bool IsLocalSym = Res.getAddSym()->isInSection() || Res.getAddSym()->isTemporary() || - (Res.getAddSym()->isELF() && - cast<MCSymbolELF>(Res.getAddSym())->getBinding() == ELF::STB_LOCAL); + (getContext().isELF() && + static_cast<const MCSymbolELF *>(Res.getAddSym())->getBinding() == + ELF::STB_LOCAL); // For O32, "$"-prefixed symbols are recognized as temporary while // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" // manually. @@ -6653,7 +6654,7 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) { llvm_unreachable("Should never fail"); } } - } else if (Sym->isUnset()) { + } else if (Sym->isUndefined()) { // If symbol is unset, it might be created in the `parseSetAssignment` // routine as an alias for a numeric register name. // Lookup in the aliases list. diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index c2169be..33aab71 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -283,9 +283,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -297,7 +296,6 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. // Where do we start in the object - unsigned Offset = Fixup.getOffset(); // Number of bytes we need to fixup unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; // Used to point to big endian bytes @@ -328,7 +326,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); + CurVal |= (uint64_t)((uint8_t)Data[Idx]) << (i * 8); } uint64_t Mask = ((uint64_t)(-1) >> @@ -340,7 +338,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? 
calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff); + Data[Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 816626d..40b5853 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -40,8 +40,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 7abe9c9..16247bd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -166,8 +166,8 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup, case Mips::S_GOTTPREL: case Mips::S_TPREL_HI: case Mips::S_TPREL_LO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -450,6 +450,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, needsRelocateWithSymbol(V, (Type >> 8) & 0xff) || needsRelocateWithSymbol(V, (Type >> 16) & 0xff); + auto *Sym = static_cast<const MCSymbolELF *>(V.getAddSym()); switch (Type) { default: errs() << Type << "\n"; @@ -481,7 +482,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, // FIXME: It should be safe to return false for the STO_MIPS_MICROMIPS but // we neglect to handle the adjustment to the LSB of the addend that // it causes in applyFixup() and similar. - if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS) + if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS) return true; return false; @@ -492,7 +493,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: - if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS) + if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS) return true; [[fallthrough]]; case ELF::R_MIPS_26: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index e8b9746..feeadc5e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -76,7 +76,7 @@ void MipsELFStreamer::createPendingLabelRelocs() { // FIXME: Also mark labels when in MIPS16 mode. 
if (ELFTargetStreamer->isMicroMipsEnabled()) {
     for (auto *L : Labels) {
-      auto *Label = cast<MCSymbolELF>(L);
+      auto *Label = static_cast<MCSymbolELF *>(L);
       getAssembler().registerSymbol(*Label);
       Label->setOther(ELF::STO_MIPS_MICROMIPS);
     }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..5df70c4 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -931,7 +931,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
 }
 void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   getStreamer().getAssembler().registerSymbol(*Symbol);
   uint8_t Type = Symbol->getType();
   if (Type != ELF::STT_FUNC)
@@ -1015,11 +1015,11 @@ void MipsTargetELFStreamer::finish() {
 }
 void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) {
-  auto *Symbol = cast<MCSymbolELF>(S);
+  auto *Symbol = static_cast<MCSymbolELF *>(S);
   // If the rhs is a microMIPS symbol, then mark Symbol as microMIPS.
   if (Value->getKind() != MCExpr::SymbolRef)
     return;
-  const auto &RhsSym = cast<MCSymbolELF>(
+  auto &RhsSym = static_cast<const MCSymbolELF &>(
       static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
   if (!(RhsSym.getOther() & ELF::STO_MIPS_MICROMIPS))
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
 void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
   auto &S = getStreamer();
+  S.ensureHeadroom(4);
   S.addFixup(Value, Mips::fixup_Mips_GPREL32);
   S.appendContents(4, 0);
 }
 void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
   auto &S = getStreamer();
+  S.ensureHeadroom(8);
   // fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(8, 0); @@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_DTPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_DTPREL64); S.appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_TPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_TPREL64); S.appendContents(8, 0); } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index a2e48ab..4530fc6 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1052,8 +1052,7 @@ void MipsAsmPrinter::EmitFPCallStub( // __call_stub_fp_xxxx: // std::string x = "__call_stub_fp_" + std::string(Symbol); - MCSymbolELF *Stub = - cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x))); + MCSymbol *Stub = OutContext.getOrCreateSymbol(StringRef(x)); TS.emitDirectiveEnt(*Stub); MCSymbol *MType = OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 2ae7520..6765ecb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -151,6 +151,8 @@ class OneUse2<SDPatternOperator operator> class fpimm_pos_inf<ValueType vt> : FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>; +class zeroinitializer<ValueType vt> : + PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>; // Operands which can hold a Register or an Immediate. 
@@ -789,6 +791,23 @@ def UMAX16x2 : I16x2<"max.u", umax>; def SMIN16x2 : I16x2<"min.s", smin>; def UMIN16x2 : I16x2<"min.u", umin>; +let Predicates = [hasPTX<80>, hasSM<90>] in { + + def MIN_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "min.relu.s32", + [(set i32:$dst, (smax (smin i32:$a, i32:$b), 0))]>; + def MAX_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "max.relu.s32", + [(set i32:$dst, (smax (smax i32:$a, i32:$b), 0))]>; + def MIN_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "min.relu.s16x2", + [(set v2i16:$dst, (smax (smin v2i16:$a, v2i16:$b), + zeroinitializer<v2i16>))]>; + def MAX_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "max.relu.s16x2", + [(set v2i16:$dst, (smax (smax v2i16:$a, v2i16:$b), + zeroinitializer<v2i16>))]>; +} // // Wide multiplication @@ -2379,9 +2398,6 @@ def fpimm_any_zero : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>; -def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>; -def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>; - // Perform substitution if fma only has one use, and also if instruction has // nnan instruction flag or if the TM has NoNaNsFPMath def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c), @@ -2404,10 +2420,10 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat> let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in { def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>; - def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, fpimm_positive_zero_v2f16>; + def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>; } let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in { def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>; - def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, fpimm_positive_zero_v2bf16>; + def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>; } diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 58766b1..1fc475d 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1756,7 +1756,7 @@ bool PPCAsmParser::parseDirectiveLocalEntry(SMLoc L) { if (getParser().parseIdentifier(Name)) return Error(L, "expected identifier in '.localentry' directive"); - MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name)); + auto *Sym = static_cast<MCSymbolELF *>(getContext().getOrCreateSymbol(Name)); const MCExpr *Expr; if (parseToken(AsmToken::Comma) || diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 0e8828f..04b886a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" @@ -93,8 +94,8 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsResolved) override; + const MCValue &Target, uint8_t *Data, uint64_t Value, + bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const 
MCValue &Target) { // If there is a @ specifier, unless it is optimized out (e.g. constant @l), @@ -112,14 +113,15 @@ public: // to resolve the fixup directly. Emit a relocation and leave // resolution of the final target address to the linker. if (const auto *A = Target.getAddSym()) { - if (const auto *S = dyn_cast<MCSymbolELF>(A)) { + if (getContext().isELF()) { // The "other" values are stored in the last 6 bits of the second // byte. The traditional defines for STO values assume the full byte // and thus the shift to pack it. - unsigned Other = S->getOther() << 2; + unsigned Other = static_cast<const MCSymbolELF *>(A)->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) return true; - } else if (const auto *S = dyn_cast<MCSymbolXCOFF>(A)) { + } else if (getContext().isXCOFF()) { + auto *S = static_cast<const MCSymbolXCOFF *>(A); return !Target.isAbsolute() && S->isExternal() && S->getStorageClass() == XCOFF::C_WEAKEXT; } @@ -185,9 +187,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &TargetVal, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &TargetVal, uint8_t *Data, + uint64_t Value, bool IsResolved) { // In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to // reference the null symbol. auto Target = TargetVal; @@ -205,7 +206,6 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); // For each byte of the fragment that the fixup touches, mask in the bits @@ -213,7 +213,7 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1 - i); - Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff); } } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index a5d3be4..329ad6e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -86,8 +86,8 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, case PPC::S_TPREL_HIGHEST: case PPC::S_TPREL_HIGHESTA: case PPC::S_TPREL_LO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -499,7 +499,8 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, // The "other" values are stored in the last 6 bits of the second byte. // The traditional defines for STO values assume the full byte and thus // the shift to pack it. 
- unsigned Other = cast<MCSymbolELF>(V.getAddSym())->getOther() << 2; + unsigned Other = + static_cast<const MCSymbolELF *>(V.getAddSym())->getOther() << 2; return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index 2dbc31f..132d5a4 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -65,7 +65,7 @@ void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst, MCFragment *InstructionFragment = getCurrentFragment(); SMLoc InstLoc = Inst.getLoc(); // Check if there was a last label emitted. - if (LastLabel && !LastLabel->isUnset() && LastLabelLoc.isValid() && + if (LastLabel && LastLabel->isDefined() && LastLabelLoc.isValid() && InstLoc.isValid()) { const SourceMgr *SourceManager = getContext().getSourceManager(); unsigned InstLine = SourceManager->FindLineNumber(InstLoc); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 3dad0e8..d856c3f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -211,7 +211,7 @@ public: : PPCTargetStreamer(S), OS(OS) {} void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override { - if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) { + if (getContext().isXCOFF()) { MCSymbolXCOFF *TCSym = static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly()) ->getQualNameSymbol(); @@ -225,10 +225,10 @@ public: if (Kind == PPC::S_AIX_TLSGD || Kind == PPC::S_AIX_TLSGDM || Kind == PPC::S_AIX_TLSIE || Kind == PPC::S_AIX_TLSLE || Kind == PPC::S_AIX_TLSLD || Kind == PPC::S_AIX_TLSML) - OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@" + OS << "\t.tc " << TCSym->getName() << "," << S.getName() << "@" << getContext().getAsmInfo()->getSpecifierName(Kind) << '\n'; else - OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n'; + OS << "\t.tc " << TCSym->getName() << "," << S.getName() << '\n'; if (TCSym->hasRename()) Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName()); @@ -308,7 +308,7 @@ public: } void emitAssignment(MCSymbol *S, const MCExpr *Value) override { - auto *Symbol = cast<MCSymbolELF>(S); + auto *Symbol = static_cast<MCSymbolELF *>(S); // When encoding an assignment to set symbol A to symbol B, also copy // the st_other bits encoding the local entry point offset. 
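On the st_other convention these hunks keep referring to: the PPC64 ELFv2 ABI packs the local-entry offset into the three high bits of st_other, and MCSymbolELF stores the six-bit field shifted down by two, which is why the code shifts `getOther()` left by 2 before masking with STO_PPC64_LOCAL_MASK. A standalone sketch of the decoding, with the mask constants mirroring llvm/BinaryFormat/ELF.h and the helper name invented for illustration:

```cpp
#include <cstdint>
#include <cstdio>

constexpr unsigned STO_PPC64_LOCAL_BIT = 5;
constexpr unsigned STO_PPC64_LOCAL_MASK = 7u << STO_PPC64_LOCAL_BIT;

// The 3-bit field encodes 0, or N in 1..6 meaning a local entry 1<<N bytes
// past the global entry, rounded down to a multiple of 4 (per the ELFv2 ABI).
unsigned decodeLocalEntryOffset(uint8_t StOther) {
  unsigned Val = (StOther & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT;
  return ((1u << Val) >> 2) << 2;
}

int main() {
  // Field value 3 -> local entry is 8 bytes past the global entry point.
  uint8_t StOther = 3u << STO_PPC64_LOCAL_BIT;
  std::printf("local entry offset: %u\n", decodeLocalEntryOffset(StOther));
  return 0;
}
```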
@@ -335,7 +335,7 @@ private: auto *Ref = dyn_cast<const MCSymbolRefExpr>(S); if (!Ref) return false; - const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol()); + auto &RhsSym = static_cast<const MCSymbolELF &>(Ref->getSymbol()); unsigned Other = D->getOther(); Other &= ~ELF::STO_PPC64_LOCAL_MASK; Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index ce1d51a..2ab2c14 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2155,7 +2155,8 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp); + TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym), + LocalOffsetExp); } else if (Subtarget->isUsingPCRelativeCalls()) { // When generating the entry point for a function we have a few scenarios // based on whether or not that function uses R2 and whether or not that @@ -2182,7 +2183,7 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() { MF->hasInlineAsm() || (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), + TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym), MCConstantExpr::create(1, OutContext)); } } @@ -2766,7 +2767,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { if (GV->hasComdat()) report_fatal_error("COMDAT not yet supported by AIX."); - MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV)); + auto *GVSym = static_cast<MCSymbolXCOFF *>(getSymbol(GV)); if (GV->isDeclarationForLinker()) { emitLinkage(GV, GVSym); @@ -2859,7 +2860,7 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { MCSectionSubPair Current = OutStreamer->getCurrentSection(); // Emit function descriptor. OutStreamer->switchSection( - cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect()); + static_cast<MCSymbolXCOFF *>(CurrentFnDescSym)->getRepresentedCsect()); // Emit aliasing label for function descriptor csect. 
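For context on the descriptor csect handled in this hunk: on AIX (as in PPC64 ELFv1), a function symbol designates a descriptor rather than the code itself. A plain-struct sketch of the 64-bit layout, offered as an illustration of the ABI convention, not an LLVM type:

```cpp
#include <cstdint>

struct FunctionDescriptor {
  uint64_t EntryPoint; // address of the first instruction
  uint64_t TOCBase;    // TOC anchor the callee expects in R2
  uint64_t EnvPointer; // environment pointer, unused by C
};

int main() { return sizeof(FunctionDescriptor) == 24 ? 0 : 1; }
```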
for (const GlobalAlias *Alias : GOAliasMap[&MF->getFunction()]) @@ -2994,7 +2995,8 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { SmallString<128> Name; StringRef Prefix = "."; Name += Prefix; - Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName(); + Name += static_cast<const MCSymbolXCOFF *>(I.first.first) + ->getSymbolTableName(); MCSymbol *S = OutContext.getOrCreateSymbol(Name); TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(S, TM)); @@ -3112,7 +3114,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { setCsectAlignment(&G); std::optional<CodeModel::Model> OptionalCodeModel = G.getCodeModel(); if (OptionalCodeModel) - setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&G)), + setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&G)), *OptionalCodeModel); } @@ -3139,7 +3141,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { if (GVar) { std::optional<CodeModel::Model> OptionalCodeModel = GVar->getCodeModel(); if (OptionalCodeModel) - setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&Alias)), + setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&Alias)), *OptionalCodeModel); } @@ -3190,8 +3192,8 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::BL_NOP: { const MachineOperand &MO = MI->getOperand(0); if (MO.isSymbol()) { - MCSymbolXCOFF *S = - cast<MCSymbolXCOFF>(OutContext.getOrCreateSymbol(MO.getSymbolName())); + auto *S = static_cast<MCSymbolXCOFF *>( + OutContext.getOrCreateSymbol(MO.getSymbolName())); ExtSymSDNodeSymbols.insert(S); } } break; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f179873..30b5fd6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1433,7 +1433,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. 
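The hunk that follows drops the constructor-time `setHasMultipleConditionRegisters()` call; the override added at the end of this file's changes turns the target-wide flag into a per-value-type query. A minimal sketch of that flag-to-virtual-hook migration, with simplified stand-in types (an `int` in place of EVT):

```cpp
// Names are illustrative only; the real hook lives on TargetLoweringBase.
struct LoweringBase {
  virtual ~LoweringBase() = default;
  // Replaces a bool member that setHasMultipleConditionRegisters() toggled.
  virtual bool hasMultipleConditionRegisters(int /*VT*/) const {
    return false;
  }
};

struct PPCLikeLowering : LoweringBase {
  bool UseCRBits = true;
  bool hasMultipleConditionRegisters(int /*VT*/) const override {
    return UseCRBits; // mirrors the new PPC override, modulo real types
  }
};

int main() {
  PPCLikeLowering TLI;
  return TLI.hasMultipleConditionRegisters(0) ? 0 : 1;
}
```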
if (Subtarget.useCRBits()) { - setHasMultipleConditionRegisters(); setJumpIsExpensive(); } @@ -5540,8 +5539,8 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) { const TargetMachine &TM = Subtarget.getTargetMachine(); const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering(); - MCSymbolXCOFF *S = - cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM)); + auto *S = + static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(GV, TM)); MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); return DAG.getMCSymbol(S, PtrVT); @@ -19856,3 +19855,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } + +bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const { + return Subtarget.useCRBits(); +} diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 124c711..9755f0e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1207,6 +1207,8 @@ namespace llvm { bool IsVarArg) const; bool supportsTailCallFor(const CallBase *CB) const; + bool hasMultipleConditionRegisters(EVT VT) const override; + private: struct ReuseLoadInfo { SDValue Ptr; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 9538b20..95ec42f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -327,19 +327,19 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { - MCContext &C = getContext(); - int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; + // If the label difference can be resolved, use the default handling, which + // utilizes a shorter special opcode. + if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) + return false; [[maybe_unused]] bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); assert(IsAbsolute && "CFA with invalid expression"); - Fixups.clear(); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -349,33 +349,21 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> Fixup; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. 
+ unsigned PCBytes; if (Value > 60000) { - unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); - assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - Fixup = RISCV::getRelocPairForSize(PtrSize); - OS.write_zeros(PtrSize); + PCBytes = getContext().getAsmInfo()->getCodePointerSize(); + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PCBytes + 1) + << uint8_t(dwarf::DW_LNE_set_address); + OS.write_zeros(PCBytes); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - Fixup = RISCV::getRelocPairForSize(2); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(Fixup))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(Fixup))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -386,7 +374,8 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } @@ -754,7 +743,7 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &, if (!AUIPCTarget.getAddSym()) return false; - const MCSymbolELF &SA = cast<MCSymbolELF>(*AUIPCTarget.getAddSym()); + auto &SA = static_cast<const MCSymbolELF &>(*AUIPCTarget.getAddSym()); if (SA.isUndefined()) return false; @@ -881,9 +870,8 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, } void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) @@ -898,15 +886,14 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
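The loop that follows, like its PPC, Sparc, SystemZ, and VE counterparts elsewhere in this diff, ORs the fixup value into the fragment one byte at a time; only which side of the indexing flips with endianness differs (Sparc indexes the destination instead of the shift, which is equivalent). A standalone sketch of the shared pattern:

```cpp
#include <cstdint>

void applyFixupBytes(uint8_t *Data, uint64_t Value, unsigned NumBytes,
                     bool LittleEndian) {
  for (unsigned I = 0; I != NumBytes; ++I) {
    unsigned Idx = LittleEndian ? I : (NumBytes - 1 - I);
    // OR rather than assign, so bits already encoded in the instruction
    // word are preserved.
    Data[I] |= uint8_t((Value >> (Idx * 8)) & 0xff);
  }
}

int main() {
  uint8_t Insn[4] = {0, 0, 0, 0};
  applyFixupBytes(Insn, 0x11223344, 4, /*LittleEndian=*/true);
  return (Insn[0] == 0x44 && Insn[3] == 0x11) ? 0 : 1;
}
```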
for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index d97d632..adec1ec 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -46,8 +46,7 @@ public: void maybeAddVendorReloc(const MCFragment &, const MCFixup &); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 9bf7896..2885e3c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -55,8 +55,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup, case ELF::R_RISCV_TLS_GOT_HI20: case ELF::R_RISCV_TLS_GD_HI20: case ELF::R_RISCV_TLSDESC_HI20: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; case ELF::R_RISCV_PLT32: case ELF::R_RISCV_GOT32_PCREL: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index c654fd2b..543c4c5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -117,7 +117,7 @@ void RISCVTargetELFStreamer::reset() { void RISCVTargetELFStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) { getStreamer().getAssembler().registerSymbol(Symbol); - cast<MCSymbolELF>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC); + static_cast<MCSymbolELF &>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC); } void RISCVELFStreamer::reset() { @@ -142,7 +142,8 @@ void RISCVELFStreamer::emitInstructionsMappingSymbol() { } void RISCVELFStreamer::emitMappingSymbol(StringRef Name) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index f816561c..98c8738 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -68,27 +68,6 @@ enum Fixups { fixup_riscv_invalid, NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind }; - -static inline std::pair<MCFixupKind, MCFixupKind> -getRelocPairForSize(unsigned Size) { - switch (Size) { - default: - llvm_unreachable("unsupported fixup size"); - case 1: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD8, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB8); - case 2: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD16, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB16); - case 4: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD32, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB32); - case 8: - return 
std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD64, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB64); - } -} - } // end namespace llvm::RISCV #endif diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 3655861..f70837e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -68,36 +68,30 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection( const Triple &Triple = Ctx.getTargetTriple(); Align NoteAlign; + uint64_t DescSize; if (Triple.isArch64Bit()) { NoteAlign = Align(8); + DescSize = 16; } else { assert(Triple.isArch32Bit()); NoteAlign = Align(4); + DescSize = 12; } assert(Ctx.getObjectFileType() == MCContext::Environment::IsELF); MCSection *const NoteSection = Ctx.getELFSection(".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); - NoteSection->setAlignment(NoteAlign); OutStreamer.pushSection(); OutStreamer.switchSection(NoteSection); // Emit the note header - OutStreamer.emitIntValue(4, 4); // n_namsz - - MCSymbol *const NDescBeginSym = Ctx.createTempSymbol(); - MCSymbol *const NDescEndSym = Ctx.createTempSymbol(); - const MCExpr *const NDescSzExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(NDescEndSym, Ctx), - MCSymbolRefExpr::create(NDescBeginSym, Ctx), Ctx); - - OutStreamer.emitValue(NDescSzExpr, 4); // n_descsz + OutStreamer.emitValueToAlignment(NoteAlign); + OutStreamer.emitIntValue(4, 4); // n_namsz + OutStreamer.emitIntValue(DescSize, 4); // n_descsz OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); // n_type OutStreamer.emitBytes(StringRef("GNU", 4)); // n_name // Emit n_desc field - OutStreamer.emitLabel(NDescBeginSym); - OutStreamer.emitValueToAlignment(NoteAlign); // Emit the feature_1_and property OutStreamer.emitIntValue(ELF::GNU_PROPERTY_RISCV_FEATURE_1_AND, 4); // pr_type @@ -105,7 +99,6 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection( OutStreamer.emitIntValue(Feature1And, 4); // pr_data OutStreamer.emitValueToAlignment(NoteAlign); // pr_padding - OutStreamer.emitLabel(NDescEndSym); OutStreamer.popSection(); } diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 82c0d8d..80a48c5 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -167,9 +167,8 @@ static std::pair<Value *, Value *> matchStridedStart(Value *Start, default: llvm_unreachable("Unexpected opcode"); case Instruction::Or: - // TODO: We'd be better off creating disjoint or here, but we don't yet - // have an IRBuilder API for that. 
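The replacement below emits a disjoint `or` for the strided start: when the two operands share no set bits, OR and ADD compute the same result, and the `disjoint` flag records that fact for later folds. A plain-integer demonstration:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Base = 0xF0, Offset = 0x0C;
  assert((Base & Offset) == 0 && "operands must be disjoint");
  assert((Base | Offset) == Base + Offset);
  return 0;
}
```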
- [[fallthrough]]; + Start = Builder.CreateOr(Start, Splat, "", /*IsDisjoint=*/true); + break; case Instruction::Add: Start = Builder.CreateAdd(Start, Splat); break; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index adbfbeb..e09e6fb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7012,6 +7012,7 @@ static unsigned getRISCVVLOp(SDValue Op) { OP_CASE(FDIV) OP_CASE(FNEG) OP_CASE(FABS) + OP_CASE(FCOPYSIGN) OP_CASE(FSQRT) OP_CASE(SMIN) OP_CASE(SMAX) @@ -7079,6 +7080,15 @@ static unsigned getRISCVVLOp(SDValue Op) { if (Op.getSimpleValueType().getVectorElementType() == MVT::i1) return RISCVISD::VMXOR_VL; return RISCVISD::XOR_VL; + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + return RISCVISD::VZEXT_VL; + case ISD::SIGN_EXTEND: + return RISCVISD::VSEXT_VL; + case ISD::SETCC: + return RISCVISD::SETCC_VL; + case ISD::VSELECT: + return RISCVISD::VMERGE_VL; case ISD::VP_SELECT: case ISD::VP_MERGE: return RISCVISD::VMERGE_VL; @@ -7419,12 +7429,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (Op.getOperand(0).getValueType().isVector() && Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1) return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1); - return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VZEXT_VL); + if (Op.getValueType().isScalableVector()) + return Op; + return lowerToScalableOp(Op, DAG); case ISD::SIGN_EXTEND: if (Op.getOperand(0).getValueType().isVector() && Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1) return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1); - return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VSEXT_VL); + if (Op.getValueType().isScalableVector()) + return Op; + return lowerToScalableOp(Op, DAG); case ISD::SPLAT_VECTOR_PARTS: return lowerSPLAT_VECTOR_PARTS(Op, DAG); case ISD::INSERT_VECTOR_ELT: @@ -8166,7 +8180,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget)) return SplitVectorOp(Op, DAG); - return lowerFixedLengthVectorSetccToRVV(Op, DAG); + return lowerToScalableOp(Op, DAG); } case ISD::ADD: case ISD::SUB: @@ -8182,6 +8196,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::UREM: case ISD::BSWAP: case ISD::CTPOP: + case ISD::VSELECT: return lowerToScalableOp(Op, DAG); case ISD::SHL: case ISD::SRA: @@ -8250,14 +8265,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerToScalableOp(Op, DAG); assert(Op.getOpcode() != ISD::CTTZ); return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG); - case ISD::VSELECT: - return lowerFixedLengthVectorSelectToRVV(Op, DAG); case ISD::FCOPYSIGN: if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) return lowerFCOPYSIGN(Op, DAG, Subtarget); if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); - return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG); + return lowerToScalableOp(Op, DAG); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -9694,33 +9707,6 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VecVT, Select, DAG, Subtarget); } -SDValue RISCVTargetLowering::lowerFixedLengthVectorExtendToRVV( - SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const { - MVT ExtVT = Op.getSimpleValueType(); - // Only custom-lower extensions from fixed-length vector types. 
- if (!ExtVT.isFixedLengthVector()) - return Op; - MVT VT = Op.getOperand(0).getSimpleValueType(); - // Grab the canonical container type for the extended type. Infer the smaller - // type from that to ensure the same number of vector elements, as we know - // the LMUL will be sufficient to hold the smaller type. - MVT ContainerExtVT = getContainerForFixedLengthVector(ExtVT); - // Get the extended container type manually to ensure the same number of - // vector elements between source and dest. - MVT ContainerVT = MVT::getVectorVT(VT.getVectorElementType(), - ContainerExtVT.getVectorElementCount()); - - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget); - - SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - - SDValue Ext = DAG.getNode(ExtendOpc, DL, ContainerExtVT, Op1, Mask, VL); - - return convertFromScalableVector(ExtVT, Ext, DAG, Subtarget); -} - // Custom-lower truncations from vectors to mask vectors by using a mask and a // setcc operation: // (vXi1 = trunc vXiN vec) -> (vXi1 = setcc (and vec, 1), 0, ne) @@ -12834,31 +12820,6 @@ SDValue RISCVTargetLowering::lowerVectorCompress(SDValue Op, return Res; } -SDValue -RISCVTargetLowering::lowerFixedLengthVectorSetccToRVV(SDValue Op, - SelectionDAG &DAG) const { - MVT InVT = Op.getOperand(0).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(InVT); - - MVT VT = Op.getSimpleValueType(); - - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget); - SDValue Op2 = - convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget); - - SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT.getVectorNumElements(), ContainerVT, DL, - DAG, Subtarget); - MVT MaskVT = getMaskTypeFor(ContainerVT); - - SDValue Cmp = - DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, - {Op1, Op2, Op.getOperand(2), DAG.getUNDEF(MaskVT), Mask, VL}); - - return convertFromScalableVector(VT, Cmp, DAG, Subtarget); -} - SDValue RISCVTargetLowering::lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); @@ -12985,51 +12946,6 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const { return Max; } -SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV( - SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); - SDValue Mag = Op.getOperand(0); - SDValue Sign = Op.getOperand(1); - assert(Mag.getValueType() == Sign.getValueType() && - "Can only handle COPYSIGN with matching types."); - - MVT ContainerVT = getContainerForFixedLengthVector(VT); - Mag = convertToScalableVector(ContainerVT, Mag, DAG, Subtarget); - Sign = convertToScalableVector(ContainerVT, Sign, DAG, Subtarget); - - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - - SDValue CopySign = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Mag, - Sign, DAG.getUNDEF(ContainerVT), Mask, VL); - - return convertFromScalableVector(VT, CopySign, DAG, Subtarget); -} - -SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV( - SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - - MVT I1ContainerVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); - - SDValue CC = - convertToScalableVector(I1ContainerVT, Op.getOperand(0), DAG, Subtarget); - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget); - SDValue Op2 = - 
convertToScalableVector(ContainerVT, Op.getOperand(2), DAG, Subtarget); - - SDLoc DL(Op); - SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - - SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1, - Op2, DAG.getUNDEF(ContainerVT), VL); - - return convertFromScalableVector(VT, Select, DAG, Subtarget); -} - SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const { const auto &TSInfo = @@ -13056,7 +12972,9 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, // "cast" fixed length vector to a scalable vector. assert(useRVVForFixedLengthVectorVT(V.getSimpleValueType()) && "Only fixed length vectors are supported!"); - Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget)); + MVT VContainerVT = ContainerVT.changeVectorElementType( + V.getSimpleValueType().getVectorElementType()); + Ops.push_back(convertToScalableVector(VContainerVT, V, DAG, Subtarget)); } SDLoc DL(Op); @@ -21478,11 +21396,10 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode( // TODO: Add more target nodes. switch (Op.getOpcode()) { case RISCVISD::SELECT_CC: - // Integer select_cc cannot create poison. - // TODO: What are the FP poison semantics? - // TODO: This instruction blocks poison from the unselected operand, can - // we do anything with that? - return !Op.getValueType().isInteger(); + // Integer comparisons cannot create poison. + assert(Op.getOperand(0).getValueType().isInteger() && + "RISCVISD::SELECT_CC only compares integers"); + return false; } return TargetLowering::canCreateUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); @@ -22550,6 +22467,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( constexpr StringLiteral SupportedInterruptKinds[] = { "machine", "supervisor", + "rnmi", "qci-nest", "qci-nonest", "SiFive-CLIC-preemptible", @@ -22567,6 +22485,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( reportFatalUsageError( "'SiFive-CLIC-*' interrupt kinds require XSfmclic extension"); + if (Kind == "rnmi" && !Subtarget.hasStdExtSmrnmi()) + reportFatalUsageError("'rnmi' interrupt kind requires Smrnmi extension"); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); if (Kind.starts_with("SiFive-CLIC-preemptible") && TFI->hasFP(MF)) reportFatalUsageError("'SiFive-CLIC-preemptible' interrupt kinds cannot " @@ -23212,7 +23132,11 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (Kind == "supervisor") RetOpc = RISCVISD::SRET_GLUE; - else if (Kind == "qci-nest" || Kind == "qci-nonest") { + else if (Kind == "rnmi") { + assert(STI.hasFeature(RISCV::FeatureStdExtSmrnmi) && + "Need Smrnmi extension for rnmi"); + RetOpc = RISCVISD::MNRET_GLUE; + } else if (Kind == "qci-nest" || Kind == "qci-nonest") { assert(STI.hasFeature(RISCV::FeatureVendorXqciint) && "Need Xqciint for qci-(no)nest"); RetOpc = RISCVISD::QC_C_MILEAVERET_GLUE; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46..fa50e21 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -534,9 +534,6 @@ private: SDValue lowerMaskedScatter(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op,
- SelectionDAG &DAG) const; SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const; @@ -551,8 +548,6 @@ private: SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPStridedStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPCttzElements(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG, - unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 6536078..8bd3830 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -75,6 +75,8 @@ def riscv_sret_glue : RVSDNode<"SRET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; def riscv_mret_glue : RVSDNode<"MRET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; +def riscv_mnret_glue : RVSDNode<"MNRET_GLUE", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; def riscv_mileaveret_glue : RVSDNode<"QC_C_MILEAVERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; @@ -935,7 +937,6 @@ def MRET : Priv<"mret", 0b0011000>, Sched<[]> { let rs1 = 0; let rs2 = 0b00010; } -} // isBarrier = 1, isReturn = 1, isTerminator = 1 let Predicates = [HasStdExtSmrnmi] in { def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> { @@ -944,6 +945,8 @@ def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> { let rs2 = 0b00010; } }// Predicates = [HasStdExtSmrnmi] +} // isBarrier = 1, isReturn = 1, isTerminator = 1 + def WFI : Priv<"wfi", 0b0001000>, Sched<[]> { let rd = 0; @@ -1801,6 +1804,8 @@ def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; def : Pat<(riscv_sret_glue), (SRET)>; def : Pat<(riscv_mret_glue), (MRET)>; +let Predicates = [HasStdExtSmrnmi] in +def : Pat<(riscv_mnret_glue), (MNRET)>; let isCall = 1, Defs = [X1] in { let Predicates = [NoStdExtZicfilp] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 5265613..2c64b0c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -14,6 +14,14 @@ // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// +def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 3>, + SDTCisPtrTy<2>, + SDTCisVT<3, XLenVT>]>; + +def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + def uimm5nonzero : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5, "NonZero">; @@ -27,6 +35,8 @@ def uimm5nonzero : RISCVOp<XLenVT>, }]; } +def tuimm5nonzero : TImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]>; + def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5, "GT3">; @@ -92,6 +102,8 @@ def uimm5slist : RISCVOp<XLenVT>, ImmLeaf<XLenVT, }]; } +def tuimm7_lsb00 : TImmLeaf<XLenVT,[{return isShiftedUInt<5, 2>(Imm);}]>; + def uimm10 : RISCVUImmLeafOp<10>; def uimm11 : RISCVUImmLeafOp<11>; @@ -457,6 +469,13 @@ class QCIRVInstRR<bits<5> funct5, DAGOperand InTyRs1, string opcodestr> : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), (ins InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, "$rd, $rs1, $rs2">; +class QCIRVInstRRTied<bits<5> funct5, DAGOperand InTyRs1, string opcodestr> + : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + let Constraints = "$rd = $rd_wb"; +} + class QCIBitManipRII<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs1, string opcodestr> : RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd), @@ -470,11 +489,26 @@ class QCIBitManipRII<bits<3> funct3, bits<2> funct2, let Inst{24-20} = shamt; } +class QCIBitManipRIITied<bits<3> funct3, bits<2> funct2, + DAGOperand InTyRs1, string opcodestr> + : RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, + InTyRs1:$rs1, uimm5_plus1:$width, uimm5:$shamt), + opcodestr, "$rd, $rs1, $width, $shamt"> { + let Constraints = "$rd = $rd_wb"; + bits<5> shamt; + bits<5> width; + + let Inst{31-30} = funct2; + let Inst{29-25} = width; + let Inst{24-20} = shamt; +} + class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11, string opcodestr> - : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd), - (ins GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr, + : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr, "$rd, $rs1, $imm11"> { + let Constraints = "$rd = $rd_wb"; bits<11> imm11; let Inst{31-31} = funct1; @@ -858,12 +892,12 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { let Inst{29-25} = width; let Inst{24-20} = shamt; } - def QC_INSB : QCIBitManipRII<0b001, 0b01, GPR, "qc.insb">; - def QC_INSBH : QCIBitManipRII<0b001, 0b10, GPR, "qc.insbh">; - def QC_INSBR : QCIRVInstRR<0b00000, GPR, "qc.insbr">; - def QC_INSBHR : QCIRVInstRR<0b00001, GPR, "qc.insbhr">; - def QC_INSBPR : QCIRVInstRR<0b00010, GPR, "qc.insbpr">; - def QC_INSBPRH : QCIRVInstRR<0b00011, GPR, "qc.insbprh">; + def QC_INSB : QCIBitManipRIITied<0b001, 0b01, GPR, "qc.insb">; + def QC_INSBH : QCIBitManipRIITied<0b001, 0b10, GPR, "qc.insbh">; + def QC_INSBR : QCIRVInstRRTied<0b00000, GPR, "qc.insbr">; + def QC_INSBHR : QCIRVInstRRTied<0b00001, GPR, "qc.insbhr">; + def QC_INSBPR : QCIRVInstRRTied<0b00010, GPR, "qc.insbpr">; + def QC_INSBPRH : QCIRVInstRRTied<0b00011, GPR, "qc.insbprh">; def QC_EXTU : QCIBitManipRII<0b010, 0b00, GPRNoX0, "qc.extu">; def QC_EXTDU : QCIBitManipRII<0b010, 0b10, GPRNoX31, "qc.extdu">; def 
QC_EXTDUR : QCIRVInstRR<0b00100, GPRNoX31, "qc.extdur">; @@ -1566,6 +1600,11 @@ def : QCISELECTIICCPat <SETEQ, QC_SELECTIIEQ>; def : QCISELECTIICCPat <SETNE, QC_SELECTIINE>; } // Predicates = [HasVendorXqcics, IsRV32] +let Predicates = [HasVendorXqcilsm, IsRV32] in { +def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), + (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; +} // Predicates = [HasVendorXqcilsm, IsRV32] + //===----------------------------------------------------------------------===// // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index a250ac8..5a5a9ed 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -206,8 +206,6 @@ let Predicates = [HasStdExtZvksh], RVVConstraint = VS2Constraint in { //===----------------------------------------------------------------------===// defvar I32IntegerVectors = !filter(vti, AllIntegerVectors, !eq(vti.SEW, 32)); -defvar I32I64IntegerVectors = !filter(vti, AllIntegerVectors, - !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64))); class ZvkI32IntegerVectors<string vd_lmul> { list<VTypeInfo> vs2_types = !cond(!eq(vd_lmul, "M8") : !filter(vti, I32IntegerVectors, !le(vti.LMul.octuple, 32)), @@ -1126,16 +1124,16 @@ let Predicates = [HasStdExtZvkned] in { defm : VPatUnaryV_S_NoMaskVectorCrypto<"int_riscv_vaesz", "PseudoVAESZ", I32IntegerVectors>; } // Predicates = [HasStdExtZvkned] -let Predicates = [HasStdExtZvknha] in { +let Predicates = [HasStdExtZvknhaOrZvknhb] in { defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknha] let Predicates = [HasStdExtZvknhb] in { - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I64IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknhb] let Predicates = [HasStdExtZvksed] in { diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp index 6ecddad..041dd07 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "RISCVSelectionDAGInfo.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/SelectionDAG.h" #define GET_SDNODE_DESC #include "RISCVGenSDNodeInfo.inc" @@ -62,3 +64,94 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, } #endif } + +SDValue RISCVSelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue
Src, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo) const { + const auto &Subtarget = DAG.getSubtarget<RISCVSubtarget>(); + // We currently do this only for Xqcilsm + if (!Subtarget.hasVendorXqcilsm()) + return SDValue(); + + // Do this only if we know the size at compile time. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + + uint64_t NumberOfBytesToWrite = ConstantSize->getZExtValue(); + + // Do this only if it is word aligned and we write a multiple of 4 bytes. + if (!(Alignment >= 4) || !((NumberOfBytesToWrite & 3) == 0)) + return SDValue(); + + SmallVector<SDValue, 8> OutChains; + SDValue SrcValueReplicated = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); + int NumberOfWords = NumberOfBytesToWrite / 4; + MachineFunction &MF = DAG.getMachineFunction(); + auto Volatile = + isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; + + // Helper for constructing the QC_SETWMI instruction + auto getSetwmiNode = [&](uint8_t SizeWords, uint8_t OffsetSetwmi) -> SDValue { + SDValue Ops[] = {Chain, SrcValueReplicated, Dst, + DAG.getTargetConstant(SizeWords, dl, MVT::i32), + DAG.getTargetConstant(OffsetSetwmi, dl, MVT::i32)}; + MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand( + DstPtrInfo.getWithOffset(OffsetSetwmi), + MachineMemOperand::MOStore | Volatile, SizeWords * 4, Align(4)); + return DAG.getMemIntrinsicNode(RISCVISD::QC_SETWMI, dl, + DAG.getVTList(MVT::Other), Ops, MVT::i32, + BaseMemOperand); + }; + + // If i8 type and constant non-zero value. + if ((Src.getValueType() == MVT::i8) && !isNullConstant(Src)) + // Replicate byte to word by multiplication with 0x01010101. + SrcValueReplicated = + DAG.getNode(ISD::MUL, dl, MVT::i32, SrcValueReplicated, + DAG.getConstant(0x01010101ul, dl, MVT::i32)); + + // We limit a QC_SETWMI to 16 words or less to improve interruptibility. + // So for 1-16 words we use a single QC_SETWMI: + // + // QC_SETWMI reg1, N, 0(reg2) + // + // For 17-32 words we use two QC_SETWMI's with the first as 16 words and the + // second for the remainder: + // + // QC_SETWMI reg1, 16, 0(reg2) + // QC_SETWMI reg1, N, 64(reg2) + // + // For 33-48 words, we would like to use (16, 16, n), but that means the last + // QC_SETWMI needs an offset of 128 which the instruction doesn't support. + // So in this case we use a length of 15 for the second instruction and we do + // the rest with the third instruction. 
+ // This means the maximum inlined number of words is 47 (for now): + // + // QC_SETWMI R2, R0, 16, 0 + // QC_SETWMI R2, R0, 15, 64 + // QC_SETWMI R2, R0, N, 124 + // + // For 48 words or more, call the target independent memset + if (NumberOfWords >= 48) + return SDValue(); + + if (NumberOfWords <= 16) { + // 1 - 16 words + return getSetwmiNode(NumberOfWords, 0); + } + + if (NumberOfWords <= 32) { + // 17 - 32 words + OutChains.push_back(getSetwmiNode(NumberOfWords - 16, 64)); + OutChains.push_back(getSetwmiNode(16, 0)); + } else { + // 33 - 47 words + OutChains.push_back(getSetwmiNode(NumberOfWords - 31, 124)); + OutChains.push_back(getSetwmiNode(15, 64)); + OutChains.push_back(getSetwmiNode(16, 0)); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); +} diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h index 641189f..08c8d11 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h @@ -34,6 +34,12 @@ public: void verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const override; + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo) const override; + bool hasPassthruOp(unsigned Opcode) const { return GenNodeInfo.getDesc(Opcode).TSFlags & RISCVISD::HasPassthruOpMask; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index da6ac2f..3f2a83f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -642,12 +642,6 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { OptimizationLevel Level) { LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); }); - - PB.registerVectorizerEndEPCallback( - [](FunctionPassManager &FPM, OptimizationLevel Level) { - if (Level.isOptimizingForSpeed()) - FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass())); - }); } yaml::MachineFunctionInfo * diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 0d5eb86..67f924a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -979,11 +979,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) const { - // The interleaved memory access pass will lower interleaved memory ops (i.e - // a load and store followed by a specific shuffle) to vlseg/vsseg - // intrinsics. - if (!UseMaskForCond && !UseMaskForGaps && - Factor <= TLI->getMaxSupportedInterleaveFactor()) { + // The interleaved memory access pass will lower (de)interleave ops combined + // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg + // only support masking per-iteration (i.e. condition), not per-segment (i.e. + // gap). 
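Returning to `EmitTargetCodeForMemset` above: the store pattern is built by splatting the byte across a word with a 0x01010101 multiply, and the word count is split into at most three QC_SETWMI pieces at the byte offsets the instruction can encode. A sketch of that split policy (helper names invented; the real lowering pushes the chunks in reverse chain order, which does not affect the result):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct Chunk {
  unsigned Words;
  unsigned ByteOffset;
};

// Split a word count the way the lowering above does; an empty result means
// "fall back to the generic memset expansion".
std::vector<Chunk> splitWords(unsigned NumWords) {
  if (NumWords == 0 || NumWords >= 48)
    return {};
  if (NumWords <= 16)
    return {{NumWords, 0}};
  if (NumWords <= 32)
    return {{16, 0}, {NumWords - 16, 64}};
  // 33-47 words: 16 then 15, because a third offset of 128 is not encodable
  // while 124 (= 16*4 + 15*4) is.
  return {{16, 0}, {15, 64}, {NumWords - 31, 124}};
}

int main() {
  uint32_t Splat = uint32_t(0xAB) * 0x01010101u; // replicate byte: 0xABABABAB
  for (const Chunk &C : splitWords(40))
    std::printf("qc.setwmi %u words at +%u, pattern 0x%08x\n", C.Words,
                C.ByteOffset, (unsigned)Splat);
  return 0;
}
```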
+ if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { auto *VTy = cast<VectorType>(VecTy); std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy); // Need to make sure type hasn't been scalarized diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99c..05d504c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -398,6 +398,10 @@ public: bool enableInterleavedAccessVectorization() const override { return true; } + bool enableMaskedInterleavedAccessVectorization() const override { + return ST->hasVInstructions(); + } + unsigned getMinTripCountTailFoldingThreshold() const override; enum RISCVRegisterClass { GPRRC, FPRRC, VRRC }; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index c946451..37a71e8 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -69,6 +69,7 @@ struct OperandInfo { // Represent as 1,2,4,8, ... and fractional indicator. This is because // EMUL can take on values that don't map to RISCVVType::VLMUL values exactly. // For example, a mask operand can have an EMUL less than MF8. + // If nullopt, then EMUL isn't used (i.e. only a single scalar is read). std::optional<std::pair<unsigned, bool>> EMUL; unsigned Log2EEW; @@ -83,12 +84,14 @@ struct OperandInfo { OperandInfo() = delete; - static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - return A.Log2EEW == B.Log2EEW && A.EMUL == B.EMUL; - } - - static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - return A.Log2EEW == B.Log2EEW; + /// Return true if the EMUL and EEW produced by \p Def are compatible with the + /// EMUL and EEW used by \p User. + static bool areCompatible(const OperandInfo &Def, const OperandInfo &User) { + if (Def.Log2EEW != User.Log2EEW) + return false; + if (User.EMUL && Def.EMUL != User.EMUL) + return false; + return true; } void print(raw_ostream &OS) const { @@ -98,7 +101,7 @@ struct OperandInfo { OS << "f"; OS << EMUL->first; } else - OS << "EMUL: unknown\n"; + OS << "EMUL: none\n"; OS << ", EEW: " << (1 << Log2EEW); } }; @@ -1399,13 +1402,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { return std::nullopt; } - // If the operand is used as a scalar operand, then the EEW must be - // compatible. Otherwise, the EMUL *and* EEW must be compatible.
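The `areCompatible` predicate introduced above merges the two earlier checks by giving EMUL an engaged/disengaged state: a disengaged user EMUL means the operand is read as a single scalar, so only EEW has to match. A reduced model with simplified types:

```cpp
#include <optional>
#include <utility>

struct Info {
  std::optional<std::pair<unsigned, bool>> EMUL; // disengaged: scalar use
  unsigned Log2EEW;
};

bool areCompatible(const Info &Def, const Info &User) {
  if (Def.Log2EEW != User.Log2EEW)
    return false; // EEW must always match
  if (User.EMUL && Def.EMUL != User.EMUL)
    return false; // EMUL only matters when the user reads a whole vector
  return true;
}

int main() {
  Info Def{std::make_pair(1u, false), 5}; // EMUL=1, EEW=32
  Info ScalarUse{std::nullopt, 5};        // vector operand used as a scalar
  return areCompatible(Def, ScalarUse) ? 0 : 1;
}
```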
- bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp); - if ((IsVectorOpUsedAsScalarOp && - !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) || - (!IsVectorOpUsedAsScalarOp && - !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) { + if (!OperandInfo::areCompatible(*ProducerInfo, *ConsumerInfo)) { LLVM_DEBUG( dbgs() << " Abort due to incompatible information for EMUL or EEW.\n"); diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp index ef84d43..5710cf2 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp @@ -21,8 +21,7 @@ public: SPIRVAsmBackend(llvm::endianness Endian) : MCAsmBackend(Endian) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override {} + uint8_t *Data, uint64_t Value, bool IsResolved) override {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index a7f6fbc..64d301e 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -375,7 +375,7 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) O << '%' << (getIDFromRegister(Op.getReg().id()) + 1); else if (Op.isImm()) - O << formatImm((int64_t)Op.getImm()); + O << formatImm(Op.getImm()); else if (Op.isDFPImm()) O << formatImm((double)Op.getDFPImm()); else if (Op.isExpr()) diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 947b574..2c3e087 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -21,7 +21,9 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/TypedPointerType.h" +#include "llvm/Transforms/Utils/Local.h" #include <queue> #include <unordered_set> @@ -187,6 +189,8 @@ class SPIRVEmitIntrinsics void applyDemangledPtrArgTypes(IRBuilder<> &B); + GetElementPtrInst *simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP); + bool runOnFunction(Function &F); bool postprocessTypes(Module &M); bool processFunctionPointers(Module &M); @@ -1458,6 +1462,24 @@ static void createSaturatedConversionDecoration(Instruction *I, createDecorationIntrinsic(I, SaturatedConversionNode, B); } +static void addSaturatedDecorationToIntrinsic(Instruction *I, IRBuilder<> &B) { + if (auto *CI = dyn_cast<CallInst>(I)) { + if (Function *Fu = CI->getCalledFunction()) { + if (Fu->isIntrinsic()) { + unsigned const int IntrinsicId = Fu->getIntrinsicID(); + switch (IntrinsicId) { + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: + createSaturatedConversionDecoration(I, B); + break; + default: + break; + } + } + } + } +} + Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) { if (!Call.isInlineAsm()) return &Call; @@ -2543,6 +2565,30 @@ void SPIRVEmitIntrinsics::applyDemangledPtrArgTypes(IRBuilder<> &B) { } } +GetElementPtrInst * +SPIRVEmitIntrinsics::simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP) { + // getelementptr [0 x T], P, 0 (zero), I -> getelementptr T, P, I. 
+ // If type is 0-length array and first index is 0 (zero), drop both the + // 0-length array type and the first index. This is a common pattern in the + // IR, e.g. when using a zero-length array as a placeholder for a flexible + // array such as unbound arrays. + assert(GEP && "GEP is null"); + Type *SrcTy = GEP->getSourceElementType(); + SmallVector<Value *, 8> Indices(GEP->indices()); + ArrayType *ArrTy = dyn_cast<ArrayType>(SrcTy); + if (ArrTy && ArrTy->getNumElements() == 0 && + PatternMatch::match(Indices[0], PatternMatch::m_Zero())) { + IRBuilder<> Builder(GEP); + Indices.erase(Indices.begin()); + SrcTy = ArrTy->getElementType(); + Value *NewGEP = Builder.CreateGEP(SrcTy, GEP->getPointerOperand(), Indices, + "", GEP->getNoWrapFlags()); + assert(llvm::isa<GetElementPtrInst>(NewGEP) && "NewGEP should be a GEP"); + return cast<GetElementPtrInst>(NewGEP); + } + return nullptr; +} + bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (Func.isDeclaration()) return false; @@ -2560,14 +2606,30 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { AggrConstTypes.clear(); AggrStores.clear(); - // fix GEP result types ahead of inference + // Fix GEP result types ahead of inference, and simplify if possible. + // Data structure for dead instructions that were simplified and replaced. + SmallPtrSet<Instruction *, 4> DeadInsts; for (auto &I : instructions(Func)) { auto *Ref = dyn_cast<GetElementPtrInst>(&I); if (!Ref || GR->findDeducedElementType(Ref)) continue; + + GetElementPtrInst *NewGEP = simplifyZeroLengthArrayGepInst(Ref); + if (NewGEP) { + Ref->replaceAllUsesWith(NewGEP); + if (isInstructionTriviallyDead(Ref)) + DeadInsts.insert(Ref); + + Ref = NewGEP; + } if (Type *GepTy = getGEPType(Ref)) GR->addDeducedElementType(Ref, normalizeType(GepTy)); } + // Remove dead instructions that were simplified and replaced. 
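The zero-length-array GEP rewrite above is justified by plain address arithmetic: the leading zero index scales by the size of the whole (empty) array and contributes nothing, so `gep [0 x T], p, 0, i` and `gep T, p, i` compute the same offset. A small C++ model of the two offset computations:

```cpp
#include <cstdint>

int main() {
  using T = uint64_t;
  const uint64_t I = 7;
  // gep [0 x T], p, 0, i: the leading 0 contributes nothing; the trailing
  // index scales by sizeof(T).
  uint64_t WithArray = 0 + I * sizeof(T);
  // gep T, p, i after the rewrite: identical byte offset.
  uint64_t Direct = I * sizeof(T);
  return WithArray == Direct ? 0 : 1;
}
```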
+ for (auto *I : DeadInsts) { + assert(I->use_empty() && "Dead instruction should not have any uses left"); + I->eraseFromParent(); + } processParamTypesByFunHeader(CurrF, B); @@ -2640,6 +2702,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (isConvergenceIntrinsic(I)) continue; + addSaturatedDecorationToIntrinsic(I, B); processInstrAfterVisit(I, B); } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 83fccdc..960eb2e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -828,6 +828,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, "Invalid array element type"); SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); SPIRVType *ArrayType = nullptr; + const SPIRVSubtarget &ST = + cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget()); if (NumElems != 0) { Register NumElementsVReg = buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR); @@ -838,6 +840,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, .addUse(NumElementsVReg); }); } else { + if (!ST.isShader()) + return nullptr; ArrayType = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray) .addDef(createTypeVReg(MIRBuilder)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index d4fa62a..e9f5ffa 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -665,6 +665,11 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_FPTOUI: return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU); + case TargetOpcode::G_FPTOSI_SAT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToS); + case TargetOpcode::G_FPTOUI_SAT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU); + case TargetOpcode::G_SITOFP: return selectIToF(ResVReg, ResType, I, true, SPIRV::OpConvertSToF); case TargetOpcode::G_UITOFP: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 1995e0f..170bddd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -203,6 +203,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { .legalForCartesianProduct(allIntScalarsAndVectors, allFloatScalarsAndVectors); + getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) + .legalForCartesianProduct(allIntScalarsAndVectors, + allFloatScalarsAndVectors); + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct(allFloatScalarsAndVectors, allScalarsAndVectors); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 0cd9d78..ab06fc0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -744,8 +744,14 @@ void SPIRV::RequirementHandler::checkSatisfiable( IsSatisfiable = false; } + AvoidCapabilitiesSet AvoidCaps; + if (!ST.isShader()) + AvoidCaps.S.insert(SPIRV::Capability::Shader); + else + AvoidCaps.S.insert(SPIRV::Capability::Kernel); + for (auto Cap : MinimalCaps) { - if (AvailableCaps.contains(Cap)) + if (AvailableCaps.contains(Cap) && !AvoidCaps.S.contains(Cap)) continue; LLVM_DEBUG(dbgs() << "Capability not supported: " << getSymbolicOperandMnemonic( @@ -1865,6 +1871,11 @@ void 
addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::TernaryBitwiseFunctionINTEL); break; } + case SPIRV::OpCopyMemorySized: { + Reqs.addCapability(SPIRV::Capability::Addresses); + // TODO: Add UntypedPointersKHR when implemented. + break; + } default: break; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index ba023af..bc60842 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -127,8 +127,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override { @@ -253,21 +252,19 @@ MCFixupKindInfo SparcAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SparcAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (!IsResolved) return; Value = adjustFixupValue(Fixup.getKind(), Value); unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? 
i : (NumBytes - 1) - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index a95c4ff..d2071c3 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -58,8 +58,8 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, case ELF::R_SPARC_TLS_IE_ADD: case ELF::R_SPARC_TLS_LE_HIX22: case ELF::R_SPARC_TLS_LE_LOX10: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index 8b5587a..1bca5c7 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -111,8 +111,8 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup, case SystemZ::S_TLSLD: case SystemZ::S_TLSLDM: case SystemZ::S_DTPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index b2cfd04..d692cbe 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -113,8 +113,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; }; @@ -152,20 +151,18 @@ MCFixupKindInfo SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) return; - unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. 
Value = extractBitsForFixup(Kind, Value, Fixup, getContext()); @@ -173,7 +170,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value &= ((uint64_t)1 << BitSize) - 1; unsigned ShiftValue = (Size * 8) - 8; for (unsigned I = 0; I != Size; ++I) { - Data[Offset + I] |= uint8_t(Value >> ShiftValue); + Data[I] |= uint8_t(Value >> ShiftValue); ShiftValue -= 8; } } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index ae6ca55a36..783f86a 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1286,7 +1286,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( if ((Opcode == SystemZ::ALFI && OpNum == 0 && isInt<8>((int32_t)MI.getOperand(2).getImm())) || (Opcode == SystemZ::ALGFI && OpNum == 0 && - isInt<8>((int64_t)MI.getOperand(2).getImm()))) { + isInt<8>(MI.getOperand(2).getImm()))) { // AL(G)FI %reg, CONST -> AL(G)SI %mem, CONST Opcode = (Opcode == SystemZ::ALFI ? SystemZ::ALSI : SystemZ::ALGSI); MachineInstr *BuiltMI = @@ -1301,7 +1301,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( if ((Opcode == SystemZ::SLFI && OpNum == 0 && isInt<8>((int32_t)-MI.getOperand(2).getImm())) || (Opcode == SystemZ::SLGFI && OpNum == 0 && - isInt<8>((int64_t)-MI.getOperand(2).getImm()))) { + isInt<8>((-MI.getOperand(2).getImm())))) { // SL(G)FI %reg, CONST -> AL(G)SI %mem, -CONST Opcode = (Opcode == SystemZ::SLFI ? SystemZ::ALSI : SystemZ::ALGSI); MachineInstr *BuiltMI = diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index b02b6af..c1b9d9f 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -112,8 +112,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char>, uint64_t Value, - bool IsResolved) override; + uint8_t *, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override { @@ -152,7 +151,7 @@ public: } // end anonymous namespace void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { switch (Fixup.getKind()) { case VE::fixup_ve_tls_gd_hi32: @@ -173,14 +172,14 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits // from the fixup value. The Value has been "split up" into the // appropriate bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? 
i : (NumBytes - 1) - i; - Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 41f31eb..c702064 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -44,8 +44,8 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup, case VE::S_TLS_GD_LO32: case VE::S_TPOFF_HI32: case VE::S_TPOFF_LO32: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 6ae69a4..80df4ed 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -212,12 +212,12 @@ static wasm::WasmLimits defaultLimits() { static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx, const StringRef &Name, bool Is64) { - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(Is64); // The default function table is synthesized by the linker. Sym->setUndefined(); @@ -703,7 +703,7 @@ public: ExpectBlockType = false; // The "true" here will cause this to be a nameless symbol. MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true); - auto *WasmSym = cast<MCSymbolWasm>(Sym); + auto *WasmSym = static_cast<MCSymbolWasm *>(Sym); WasmSym->setSignature(Signature); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = @@ -949,7 +949,8 @@ public: return error("Unknown type in .globaltype modifier: ", TypeTok); } // Now set this symbol with the correct type. - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(*Type), Mutable}); // And emit the directive again. @@ -980,7 +981,8 @@ public: // Now that we have the name and table type, we can actually create the // symbol - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); if (Is64) { Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64; @@ -1000,7 +1002,8 @@ public: auto SymName = expectIdent(); if (SymName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); if (WasmSym->isDefined()) { // We push 'Function' either when a label is parsed or a .functype // directive is parsed. 
The reason it is not easy to do this uniformly @@ -1042,7 +1045,8 @@ public: auto ExportName = expectIdent(); if (ExportName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setExportName(Ctx.allocateString(ExportName)); TOut.emitExportName(WasmSym, ExportName); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1057,7 +1061,8 @@ public: auto ImportModule = expectIdent(); if (ImportModule.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setImportModule(Ctx.allocateString(ImportModule)); TOut.emitImportModule(WasmSym, ImportModule); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1072,7 +1077,8 @@ public: auto ImportName = expectIdent(); if (ImportName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setImportName(Ctx.allocateString(ImportName)); TOut.emitImportName(WasmSym, ImportName); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1082,7 +1088,8 @@ public: auto SymName = expectIdent(); if (SymName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); auto *Signature = Ctx.createWasmSignature(); if (parseRegTypeList(Signature->Params)) return ParseStatus::Failure; @@ -1224,7 +1231,7 @@ public: if (!CWS->isText()) return; - auto *WasmSym = cast<MCSymbolWasm>(Symbol); + auto *WasmSym = static_cast<MCSymbolWasm *>(Symbol); // Unlike other targets, we don't allow data in text sections (labels // declared with .type @object). 
if (WasmSym->getType() == wasm::WASM_SYMBOL_TYPE_DATA) { diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 4a305ab..6943888 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -258,7 +258,7 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCSymbolRefExpr *SymRef; if (getSymRef(ErrorLoc, GlobalOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { case wasm::WASM_SYMBOL_TYPE_GLOBAL: Type = static_cast<wasm::ValType>(WasmSym->getGlobalType().Type); @@ -286,7 +286,7 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp, const MCSymbolRefExpr *SymRef; if (getSymRef(ErrorLoc, TableOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != wasm::WASM_SYMBOL_TYPE_TABLE) return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + @@ -302,7 +302,7 @@ bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, const MCSymbolRefExpr *SymRef = nullptr; if (getSymRef(ErrorLoc, SigOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); Sig = WasmSym->getSignature(); if (!Sig || WasmSym->getType() != Type) { diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 0f7b27b..2a398d4 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -237,7 +237,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( } else { // We don't have access to the signature, so create a symbol without one MCSymbol *Sym = getContext().createTempSymbol("typeindex", true); - auto *WasmSym = cast<MCSymbolWasm>(Sym); + auto *WasmSym = static_cast<MCSymbolWasm *>(Sym); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = MCSymbolRefExpr::create( WasmSym, WebAssembly::S_TYPEINDEX, getContext()); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 84eb15f..eecef31 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -39,7 +39,7 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, bool) override; + uint8_t *Data, uint64_t Value, bool) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -80,8 +80,7 @@ bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { if (!IsResolved) 
Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -96,13 +95,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } std::unique_ptr<MCObjectTargetWriter> diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index 2e97215..d8bfed9 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -380,7 +380,7 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, O << WebAssembly::anyTypeToString(Imm); } else { auto Expr = cast<MCSymbolRefExpr>(Op.getExpr()); - auto *Sym = cast<MCSymbolWasm>(&Expr->getSymbol()); + auto *Sym = static_cast<const MCSymbolWasm *>(&Expr->getSymbol()); if (Sym->getSignature()) { O << WebAssembly::signatureToString(Sym->getSignature()); } else { @@ -398,10 +398,10 @@ void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo, auto PrintTagOp = [&](const MCOperand &Op) { const MCSymbolRefExpr *TagExpr = nullptr; - const MCSymbolWasm *TagSym = nullptr; + const MCSymbol *TagSym = nullptr; if (Op.isExpr()) { TagExpr = cast<MCSymbolRefExpr>(Op.getExpr()); - TagSym = cast<MCSymbolWasm>(&TagExpr->getSymbol()); + TagSym = &TagExpr->getSymbol(); O << TagSym->getName() << " "; } else { // When instructions are parsed from the disassembler, we have an diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index cbaf10f..7096104 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -107,7 +107,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeULEB128(uint32_t(MO.getImm()), OS); break; case WebAssembly::OPERAND_I64IMM: - encodeSLEB128(int64_t(MO.getImm()), OS); + encodeSLEB128(MO.getImm(), OS); break; case WebAssembly::OPERAND_SIGNATURE: case WebAssembly::OPERAND_VEC_I8IMM: diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 2cf4bec..ffbc7e1 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -66,7 +66,7 @@ static const MCSection *getTargetSection(const MCExpr *Expr) { unsigned WebAssemblyWasmObjectWriter::getRelocType( const MCValue &Target, const MCFixup &Fixup, const MCSectionWasm &FixupSection, bool IsLocRel) const { - auto &SymA = cast<MCSymbolWasm>(*Target.getAddSym()); + auto &SymA = static_cast<const MCSymbolWasm &>(*Target.getAddSym()); auto Spec = WebAssembly::Specifier(Target.getSpecifier()); switch (Spec) { case WebAssembly::S_GOT: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 
1bf070e..db832bc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -171,10 +171,10 @@ MCSymbolWasm *WebAssemblyAsmPrinter::getMCSymbolForFunction( WebAssembly::signatureToString(Sig); report_fatal_error(Twine(Msg)); } - WasmSym = cast<MCSymbolWasm>( + WasmSym = static_cast<MCSymbolWasm *>( GetExternalSymbolSymbol(getEmscriptenInvokeSymbolName(Sig))); } else { - WasmSym = cast<MCSymbolWasm>(getSymbol(F)); + WasmSym = static_cast<MCSymbolWasm *>(getSymbol(F)); } return WasmSym; } @@ -186,9 +186,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } assert(!GV->isThreadLocal()); - - MCSymbolWasm *Sym = cast<MCSymbolWasm>(getSymbol(GV)); - + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(GV)); if (!Sym->getType()) { SmallVector<MVT, 1> VTs; Type *GlobalVT = GV->getValueType(); @@ -218,8 +216,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) { - auto *WasmSym = cast<MCSymbolWasm>(GetExternalSymbolSymbol(Name)); - + auto *WasmSym = static_cast<MCSymbolWasm *>(GetExternalSymbolSymbol(Name)); // May be called multiple times, so early out. if (WasmSym->getType()) return WasmSym; @@ -312,7 +309,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { // not be found here. MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>(); for (StringRef Name : MMIW.MachineSymbolsUsed) { - auto *WasmSym = cast<MCSymbolWasm>(getOrCreateWasmSymbol(Name)); + auto *WasmSym = static_cast<MCSymbolWasm *>(getOrCreateWasmSymbol(Name)); if (WasmSym->isFunction()) { // TODO(wvo): is there any case where this overlaps with the call to // emitFunctionType in the loop below? @@ -324,7 +321,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { // Emit .globaltype, .tagtype, or .tabletype declarations for extern // declarations, i.e. 
those that have only been declared (but not defined) // in the current module - auto Sym = cast_or_null<MCSymbolWasm>(It.getValue().Symbol); + auto Sym = static_cast<MCSymbolWasm *>(It.getValue().Symbol); if (Sym && !Sym->isDefined()) emitSymbolType(Sym); } @@ -381,7 +378,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { } if (F.hasFnAttribute("wasm-export-name")) { - auto *Sym = cast<MCSymbolWasm>(getSymbol(&F)); + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(&F)); StringRef Name = F.getFnAttribute("wasm-export-name").getValueAsString(); Sym->setExportName(OutContext.allocateString(Name)); getTargetStreamer()->emitExportName(Sym, Name); @@ -581,7 +578,7 @@ void WebAssemblyAsmPrinter::EmitFunctionAttributes(Module &M) { auto *GV = cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts()); StringRef AnnotationString; getConstantStringInfo(GV, AnnotationString); - auto *Sym = cast<MCSymbolWasm>(getSymbol(F)); + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(F)); CustomSections[AnnotationString].push_back(Sym); } @@ -618,7 +615,7 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() { computeSignatureVTs(F.getFunctionType(), &F, F, TM, ParamVTs, ResultVTs); auto Signature = signatureFromMVTs(OutContext, ResultVTs, ParamVTs); - auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym); + auto *WasmSym = static_cast<MCSymbolWasm *>(CurrentFnSym); WasmSym->setSignature(Signature); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 4613fcb..e48283a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -52,7 +52,7 @@ MCSymbol * WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { const GlobalValue *Global = MO.getGlobal(); if (!isa<Function>(Global)) { - auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global)); + auto *WasmSym = static_cast<MCSymbolWasm *>(Printer.getSymbol(Global)); // If the symbol doesn't have an explicit WasmSymbolType yet and the // GlobalValue is actually a WebAssembly global, then ensure the symbol is a // WASM_SYMBOL_TYPE_GLOBAL. 
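A pattern that runs through the WebAssembly hunks above and below: cast<MCSymbolWasm> becomes static_cast<MCSymbolWasm *>. The rewrite relies on a single invariant, namely that a Wasm-flavored MCContext only ever creates MCSymbolWasm instances, so the dynamic-type check that llvm::cast<> performs in assert-enabled builds is redundant on these paths. A minimal sketch of the difference, using hypothetical stand-in types rather than LLVM's own:

#include <cassert>

struct Symbol { bool IsWasm = false; };
struct WasmSymbol : Symbol { WasmSymbol() { IsWasm = true; } };

// What a cast<>-style helper buys: an assertion on the dynamic type.
WasmSymbol *checkedCast(Symbol *S) {
  assert(S->IsWasm && "cast to the wrong symbol kind");
  return static_cast<WasmSymbol *>(S);
}

// What the patch switches to: an unchecked downcast, valid only because the
// creating context guarantees every Symbol it hands out is a WasmSymbol.
WasmSymbol *uncheckedCast(Symbol *S) { return static_cast<WasmSymbol *>(S); }

The trade-off is that any future non-Wasm symbol reaching these call sites would be silently reinterpreted instead of tripping an assertion.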
@@ -123,7 +123,7 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO, const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Spec, Ctx); if (MO.getOffset() != 0) { - const auto *WasmSym = cast<MCSymbolWasm>(Sym); + const auto *WasmSym = static_cast<const MCSymbolWasm *>(Sym); if (TargetFlags == WebAssemblyII::MO_GOT) report_fatal_error("GOT symbol references do not support offsets"); if (WasmSym->isFunction()) @@ -148,12 +148,12 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( auto Signature = Ctx.createWasmSignature(); Signature->Returns = std::move(Returns); Signature->Params = std::move(Params); - MCSymbol *Sym = Printer.createTempSymbol("typeindex"); - auto *WasmSym = cast<MCSymbolWasm>(Sym); - WasmSym->setSignature(Signature); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + auto *Sym = + static_cast<MCSymbolWasm *>(Printer.createTempSymbol("typeindex")); + Sym->setSignature(Signature); + Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = - MCSymbolRefExpr::create(WasmSym, WebAssembly::S_TYPEINDEX, Ctx); + MCSymbolRefExpr::create(Sym, WebAssembly::S_TYPEINDEX, Ctx); return MCOperand::createExpr(Expr); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 747ef18..42d1271 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -104,13 +104,13 @@ const MachineOperand &WebAssembly::getCalleeOp(const MachineInstr &MI) { MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( MCContext &Ctx, const WebAssemblySubtarget *Subtarget) { StringRef Name = "__indirect_function_table"; - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit(); - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. Sym->setUndefined(); @@ -124,12 +124,12 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( MCSymbolWasm *WebAssembly::getOrCreateFuncrefCallTableSymbol( MCContext &Ctx, const WebAssemblySubtarget *Subtarget) { StringRef Name = "__funcref_call_table"; - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); // Setting Weak ensures only one table is left after linking when multiple // modules define the table.
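A second mechanical change recurs across the Sparc, SystemZ, VE, WebAssembly, X86, and Xtensa backends in this diff: applyFixup now receives a uint8_t *Data that already points at the fixup's offset within the fragment, instead of a MutableArrayRef<char> covering the whole fragment, so the byte-patching loops drop their Fixup.getOffset() indexing. A self-contained sketch of the loop under the new convention (hypothetical helper, not code from this patch):

#include <cstdint>

// Data points directly at the first byte the fixup touches; under the old
// interface the same loop indexed Data[Offset + Idx] instead.
void patchFixupBytes(uint8_t *Data, uint64_t Value, unsigned NumBytes,
                     bool LittleEndian) {
  // For each byte the fixup touches, mask in the bits from the fixup value.
  for (unsigned I = 0; I != NumBytes; ++I) {
    unsigned Idx = LittleEndian ? I : (NumBytes - 1) - I;
    Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
  }
}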
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1efef83..56a4cc3 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -174,8 +174,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -512,9 +511,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, isFirstMacroFusibleInst(Inst, *MCII))) { // If we meet an unfused branch or the first instruction in a fusible pair, // insert a BoundaryAlign fragment. - PendingBA = OS.getContext().allocFragment<MCBoundaryAlignFragment>( AlignBoundary, STI); - OS.insert(PendingBA); + PendingBA = + OS.newSpecialFragment<MCBoundaryAlignFragment>(AlignBoundary, STI); } } @@ -676,9 +674,8 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &, } void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { // Force relocation when there is a specifier. This might be too conservative // - GAS doesn't emit a relocation for call local@plt; local:. if (Target.getSpecifier()) @@ -710,7 +707,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + Data[i] = uint8_t(Value >> (i * 8)); } bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 3323b38..ea0abdd 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -349,8 +349,8 @@ unsigned X86ELFObjectWriter::getRelocType(const MCFixup &Fixup, case X86::S_TLSLDM: case X86::S_TPOFF: case X86::S_DTPOFF: - if (auto *S = Target.getAddSym()) - cast<MCSymbolELF>(S)->setType(ELF::STT_TLS); + if (auto *S = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(S)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index b8e117b..ff27005 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -369,7 +369,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { printRegName(O, Op.getReg()); } else if (Op.isImm()) { - markup(O, Markup::Immediate) << formatImm((int64_t)Op.getImm()); + markup(O, Markup::Immediate) << formatImm(Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << "offset "; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bbbb1d9..f366094 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8279,8 +8279,8 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const 
SDValue &V1, static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, - unsigned &NumExtracts, - bool &IsSubAdd) { + unsigned &NumExtracts, bool &IsSubAdd, + bool &HasAllowContract) { using namespace SDPatternMatch; MVT VT = BV->getSimpleValueType(0); @@ -8292,6 +8292,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; + HasAllowContract = NumElts != 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. @@ -8350,6 +8351,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, // Increment the number of extractions done. ++NumExtracts; + HasAllowContract &= Op->getFlags().hasAllowContract(); } // Ensure we have found an opcode for both parities and that they are @@ -8393,9 +8395,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, - SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, - unsigned ExpectedUses) { + SelectionDAG &DAG, SDValue &Opnd0, + SDValue &Opnd1, SDValue &Opnd2, + unsigned ExpectedUses, + bool AllowSubAddOrAddSubContract) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; @@ -8406,7 +8409,8 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + Options.AllowFPOpFusion == FPOpFusion::Fast || + (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract()); if (!AllowFusion) return false; @@ -8427,15 +8431,17 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; - if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, - IsSubAdd)) + bool HasAllowContract; + if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd, + HasAllowContract)) return SDValue(); MVT VT = BV->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts, + HasAllowContract)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } @@ -9132,11 +9138,17 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; + + auto PeekThroughFreeze = [](SDValue N) { + if (N->getOpcode() == ISD::FREEZE && N.hasOneUse()) + return N->getOperand(0); + return N; + }; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). 
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { - SDValue Op = V.getOperand(Idx); + SDValue Op = PeekThroughFreeze(V.getOperand(Idx)); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); @@ -23486,7 +23498,6 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, } // Try to shrink i64 compares if the input has enough zero bits. - // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)? if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) && Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) && @@ -23496,6 +23507,16 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } + // Try to shrink all i64 compares if the inputs are representable as signed + // i32. + if (CmpVT == MVT::i64 && + Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. + DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) { + CmpVT = MVT::i32; + Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); + Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); + } + // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && @@ -43165,7 +43186,7 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) { /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, - bool &IsSubAdd) { + bool &IsSubAdd, bool &HasAllowContract) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -43216,6 +43237,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, // It's a subadd if the vector in the even parity is an FADD. IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; + HasAllowContract = + V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract(); Opnd0 = LHS; Opnd1 = RHS; @@ -43273,14 +43296,17 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, SDValue Opnd0, Opnd1; bool IsSubAdd; - if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) + bool HasAllowContract; + if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd, + HasAllowContract)) return SDValue(); MVT VT = N->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2, + HasAllowContract)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } @@ -54220,7 +54246,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, } // Try to form a MULHU or MULHS node by looking for -// (trunc (srl (mul ext, ext), 16)) +// (trunc (srl (mul ext, ext), >= 16)) // TODO: This is X86 specific because we want to be able to handle wide types // before type legalization. But we can only do it if the vector will be // legalized via widening/splitting. Type legalization can't handle promotion @@ -54245,10 +54271,16 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // First instruction should be a right shift by 16 of a multiply. 
SDValue LHS, RHS; + APInt ShiftAmt; if (!sd_match(Src, - m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16)))) + m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt)))) return SDValue(); + if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits())) + return SDValue(); + + uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16; + // Count leading sign/zero bits on both inputs - if there are enough then // truncation back to vXi16 will be cheap - either as a pack/shuffle // sequence or using AVX512 truncations. If the inputs are sext/zext then the @@ -54286,7 +54318,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, InVT.getSizeInBits() / 16); SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), DAG.getBitcast(BCVT, RHS)); - return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + return DAG.getNode(ISD::SRL, DL, VT, Res, + DAG.getShiftAmountConstant(AdditionalShift, VT, DL)); } // Truncate back to source type. @@ -54294,7 +54328,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS); + return DAG.getNode(ISD::SRL, DL, VT, Res, + DAG.getShiftAmountConstant(AdditionalShift, VT, DL)); } // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 5862c7e..7c594d0 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2781,6 +2781,38 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Bytes == MFI.getObjectSize(FI); } +static bool +mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, + Register CallerSRetReg) { + const auto &Outs = CLI.Outs; + const auto &OutVals = CLI.OutVals; + + // We know the caller has an sret pointer argument (CallerSRetReg). Locate the + // operand index within the callee that may have an sret pointer too. + unsigned Pos = 0; + for (unsigned E = Outs.size(); Pos != E; ++Pos) + if (Outs[Pos].Flags.isSRet()) + break; + // Bail out if the callee does not have any sret argument. + if (Pos == Outs.size()) + return false; + + // At this point, either the caller is forwarding its sret argument to the + // callee, or the callee is being passed a different sret pointer. We now look + // for a CopyToReg, where the callee sret argument is written into a new vreg + // (which should later be %rax/%eax, if this is returned). + SDValue SRetArgVal = OutVals[Pos]; + for (SDNode *User : SRetArgVal->users()) { + if (User->getOpcode() != ISD::CopyToReg) + continue; + Register Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg(); + if (Reg == CallerSRetReg && User->getOperand(2) == SRetArgVal) + return true; + } + + return false; +} + /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. /// Note that the x86 backend does not check musttail calls for eligibility! The @@ -2802,6 +2834,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -2838,14 +2871,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (RegInfo->hasStackRealignment(MF)) return false; - // Also avoid sibcall optimization if we're an sret return fn and the callee - // is incompatible. See comment in LowerReturn about why hasStructRetAttr is - // insufficient. - if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { + // Avoid sibcall optimization if we are an sret return function and the callee + // is incompatible, unless we can prove otherwise. See comment in + // LowerReturn about why hasStructRetAttr is insufficient. + if (Register SRetReg = FuncInfo->getSRetReturnReg()) { // For a compatible tail call the callee must return our sret pointer. So it // needs to be (a) an sret function itself and (b) we pass our sret as its // sret. Condition #b is harder to determine. - return false; + if (!mayBeSRetTailCallCompatible(CLI, SRetReg)) + return false; } else if (IsCalleePopSRet) // The callee pops an sret, so we cannot tail-call, as our caller doesn't // expect that. @@ -2967,8 +3001,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); - if (unsigned BytesToPop = - MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp index 9167794..08936ad 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp @@ -37,8 +37,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; @@ -153,9 +152,8 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F, } void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCContext &Ctx = getContext(); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -168,11 +166,10 @@ void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. 
- unsigned Offset = Fixup.getOffset(); unsigned FullSize = getSize(Fixup.getKind()); for (unsigned i = 0; i != FullSize; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index dcb30b7..08944e6 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -535,9 +535,8 @@ void ARM::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) { } } -StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { - StringRef ArchName = - CPU.empty() ? TT.getArchName() : getArchName(parseCPUArch(CPU)); +StringRef ARM::computeDefaultTargetABI(const Triple &TT) { + StringRef ArchName = TT.getArchName(); if (TT.isOSBinFormatMachO()) { if (TT.getEnvironment() == Triple::EABI || @@ -575,10 +574,9 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { } } -ARM::ARMABI ARM::computeTargetABI(const Triple &TT, StringRef CPU, - StringRef ABIName) { +ARM::ARMABI ARM::computeTargetABI(const Triple &TT, StringRef ABIName) { if (ABIName.empty()) - ABIName = ARM::computeDefaultTargetABI(TT, CPU); + ABIName = ARM::computeDefaultTargetABI(TT); if (ABIName == "aapcs16") return ARM_ABI_AAPCS16; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 126be71..19a16ea 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -379,6 +379,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fadd-rtn-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; Features["atomic-global-pk-add-bf16-inst"] = true; + Features["bf16-trans-insts"] = true; + Features["bf16-cvt-insts"] = true; Features["bf8-cvt-scale-insts"] = true; Features["bitop3-insts"] = true; Features["ci-insts"] = true; @@ -401,9 +403,10 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["f32-to-f16bf16-cvt-sr-insts"] = true; Features["fp4-cvt-scale-insts"] = true; Features["fp6bf6-cvt-scale-insts"] = true; - Features["fp8-insts"] = true; + Features["fp8e5m3-insts"] = true; Features["fp8-conversion-insts"] = true; Features["fp8-cvt-scale-insts"] = true; + Features["fp8-insts"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; Features["gfx90a-insts"] = true; @@ -413,17 +416,23 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; Features["gfx12-insts"] = true; + Features["gfx1250-insts"] = true; Features["gws"] = true; Features["image-insts"] = true; - Features["s-memrealtime"] = true; - Features["s-memtime-inst"] = true; Features["mai-insts"] = true; Features["permlane16-swap"] = true; Features["permlane32-swap"] = true; Features["prng-inst"] = true; + Features["setprio-inc-wg-inst"] = true; + Features["s-memrealtime"] = true; + Features["s-memtime-inst"] = true; + Features["tanh-insts"] = true; + Features["tensor-cvt-lut-insts"] = true; + Features["transpose-load-f4f6-insts"] = true; + Features["vmem-pref-insts"] = true; + Features["vmem-to-lds-load-insts"] = true; Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; - Features["vmem-to-lds-load-insts"] = true; } else if (T.isAMDGCN()) { AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); switch (Kind) { @@ -444,6 +453,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, 
Features["bitop3-insts"] = true; Features["prng-inst"] = true; Features["tanh-insts"] = true; + Features["tensor-cvt-lut-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["bf16-cvt-insts"] = true; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index b268fea..cf94d28 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -113,10 +113,16 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() || - GV->getValueType() != GEP->getSourceElementType() || !GV->isConstant() || + !GV->getValueType()->isArrayTy() || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return nullptr; + Type *GEPSrcEltTy = GEP->getSourceElementType(); + if (GEPSrcEltTy->isArrayTy()) + GEPSrcEltTy = GEPSrcEltTy->getArrayElementType(); + if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy) + return nullptr; + Constant *Init = GV->getInitializer(); if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) return nullptr; @@ -127,12 +133,19 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( return nullptr; // There are many forms of this optimization we can handle, for now, just do - // the simple index into a single-dimensional array. + // the simple index into a single-dimensional array or elements of equal size. // - // Require: GEP GV, 0, i {{, constant indices}} - if (GEP->getNumOperands() < 3 || !isa<ConstantInt>(GEP->getOperand(1)) || - !cast<ConstantInt>(GEP->getOperand(1))->isZero() || - isa<Constant>(GEP->getOperand(2))) + // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}} + // Or: GEP i8 GV, Idx + + unsigned GEPIdxOp = 1; + if (GEP->getSourceElementType()->isArrayTy()) { + GEPIdxOp = 2; + if (!match(GEP->getOperand(1), m_ZeroInt())) + return nullptr; + } + if (GEP->getNumOperands() < GEPIdxOp + 1 || + isa<Constant>(GEP->getOperand(GEPIdxOp))) return nullptr; // Check that indices after the variable are constants and in-range for the @@ -141,7 +154,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( SmallVector<unsigned, 4> LaterIndices; Type *EltTy = Init->getType()->getArrayElementType(); - for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { + for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (!Idx) return nullptr; // Variable index. @@ -163,7 +176,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( LaterIndices.push_back(IdxVal); } - Value *Idx = GEP->getOperand(2); + Value *Idx = GEP->getOperand(GEPIdxOp); // If the index type is non-canonical, wait for it to be canonicalized. 
if (Idx->getType() != DL.getIndexType(GEP->getType())) return nullptr; @@ -6077,7 +6090,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { match(Op1, m_OneUse(m_LShr(m_Value(B), m_APIntAllowPoison(AP2))))) || (match(Op0, m_OneUse(m_AShr(m_Value(A), m_APIntAllowPoison(AP1)))) && match(Op1, m_OneUse(m_AShr(m_Value(B), m_APIntAllowPoison(AP2)))))) { - if (AP1 != AP2) + if (*AP1 != *AP2) return nullptr; unsigned TypeBits = AP1->getBitWidth(); unsigned ShAmt = AP1->getLimitedValue(TypeBits); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 0e3436d..f17fecd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1834,14 +1834,17 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, // segfaults which didn't exist in the original program. APInt DemandedPtrs(APInt::getAllOnes(VWidth)), DemandedPassThrough(DemandedElts); - if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2))) + if (auto *CMask = dyn_cast<Constant>(II->getOperand(2))) { for (unsigned i = 0; i < VWidth; i++) { - Constant *CElt = CV->getAggregateElement(i); - if (CElt->isNullValue()) - DemandedPtrs.clearBit(i); - else if (CElt->isAllOnesValue()) - DemandedPassThrough.clearBit(i); + if (Constant *CElt = CMask->getAggregateElement(i)) { + if (CElt->isNullValue()) + DemandedPtrs.clearBit(i); + else if (CElt->isAllOnesValue()) + DemandedPassThrough.clearBit(i); + } } + } + if (II->getIntrinsicID() == Intrinsic::masked_gather) simplifyAndSetOp(II, 0, DemandedPtrs, PoisonElts2); simplifyAndSetOp(II, 3, DemandedPassThrough, PoisonElts3); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index e87bee7..8da65c5 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1222,9 +1222,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) return; // Find alloca instruction that corresponds to llvm.lifetime argument. - AllocaInst *AI = cast<AllocaInst>(II.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1)); // We're interested only in allocas we can handle. 
- if (!ASan.isInterestingAlloca(*AI)) + if (!AI || !ASan.isInterestingAlloca(*AI)) return; bool DoPoison = (ID == Intrinsic::lifetime_end); AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 54d9a83..7d3c940 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3301,8 +3301,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; - AllocaInst *AI = cast<AllocaInst>(I.getArgOperand(1)); - LifetimeStartList.push_back(std::make_pair(&I, AI)); + AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1)); + if (AI) + LifetimeStartList.push_back(std::make_pair(&I, AI)); } void handleBswap(IntrinsicInst &I) { diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index fa1db28..d18c0d0 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -468,7 +468,8 @@ private: // Floating-point constants. Type *Ty = Config.getExtendedFPType(CFP->getType()); return ConstantFP::get( - Ty, extendConstantFP(CFP->getValueAPF(), Ty->getFltSemantics())); + Ty, extendConstantFP(CFP->getValueAPF(), + Ty->getScalarType()->getFltSemantics())); } // Vector, array, or aggregate constants. if (C->getType()->isVectorTy()) { diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index f6bf09d..7704e49 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2499,9 +2499,13 @@ void GVNPass::assignBlockRPONumber(Function &F) { bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); - auto It = ReplaceOperandsWithMap.find(Operand); + Use &Operand = Instr->getOperandUse(OpNum); + auto It = ReplaceOperandsWithMap.find(Operand.get()); if (It != ReplaceOperandsWithMap.end()) { + const DataLayout &DL = Instr->getDataLayout(); + if (!canReplacePointersInUseIfEqual(Operand, It->second, DL)) + continue; + LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *It->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, It->second); @@ -2679,6 +2683,11 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, Worklist.emplace_back(A, ConstantInt::get(A->getType(), IsKnownTrue)); continue; } + + if (match(LHS, m_Not(m_Value(A)))) { + Worklist.emplace_back(A, ConstantInt::get(A->getType(), !IsKnownTrue)); + continue; + } } return Changed; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a5fc0b4..1c88532 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -308,7 +308,7 @@ public: for (auto &U : I->uses()) op_push_back(U.getUser()); - llvm::sort(op_begin(), op_end()); + llvm::sort(operands()); } void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 6bdf76f..a883998 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ 
b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -117,7 +117,7 @@ struct StoreToLoadForwardingCandidate { if (std::abs(StrideLoad) != 1) return false; - unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); + unsigned TypeByteSize = DL.getTypeAllocSize(LoadType); auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 23256cf..03d9f32 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1247,8 +1247,7 @@ private: "Map index doesn't point back to a slice with this user."); } - // Disable SRoA for any intrinsics except for lifetime invariants and - // invariant group. + // Disable SRoA for any intrinsics except for lifetime invariants. // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. void visitIntrinsicInst(IntrinsicInst &II) { @@ -1268,12 +1267,6 @@ private: return; } - if (II.isLaunderOrStripInvariantGroup()) { - insertUse(II, Offset, AllocSize, true); - enqueueUsers(II); - return; - } - Base::visitIntrinsicInst(II); } @@ -3607,8 +3600,7 @@ private: } bool visitIntrinsicInst(IntrinsicInst &II) { - assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() || - II.isDroppable()) && + assert((II.isLifetimeStartOrEnd() || II.isDroppable()) && "Unexpected intrinsic!"); LLVM_DEBUG(dbgs() << " original: " << II << "\n"); @@ -3622,9 +3614,6 @@ private: return true; } - if (II.isLaunderOrStripInvariantGroup()) - return true; - assert(II.getArgOperand(1) == OldPtr); // Lifetime intrinsics are only promotable if they cover the whole alloca. // Therefore, we drop lifetime intrinsics which don't cover the whole diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index a4fa0e2..e411d68 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -84,6 +84,7 @@ add_llvm_component_library(LLVMTransformUtils SimplifyLibCalls.cpp SizeOpts.cpp SplitModule.cpp + SplitModuleByCategory.cpp StripNonLineTableDebugInfo.cpp SymbolRewriter.cpp UnifyFunctionExitNodes.cpp diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index babd7f6..2619e73 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -482,6 +482,9 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, if (II->isLifetimeStartOrEnd()) { auto *Arg = II->getArgOperand(1); + if (isa<PoisonValue>(Arg)) + return true; + // If the only uses of the alloca are lifetime intrinsics, then the // intrinsics are dead. 
return llvm::all_of(Arg->uses(), [](Use &Use) { @@ -3180,9 +3183,8 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { combineMetadataForCSE(ReplInst, I, false); } -template <typename RootType, typename ShouldReplaceFn> +template <typename ShouldReplaceFn> static unsigned replaceDominatedUsesWith(Value *From, Value *To, - const RootType &Root, const ShouldReplaceFn &ShouldReplace) { assert(From->getType() == To->getType()); @@ -3191,7 +3193,7 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To, auto *II = dyn_cast<IntrinsicInst>(U.getUser()); if (II && II->getIntrinsicID() == Intrinsic::fake_use) continue; - if (!ShouldReplace(Root, U)) + if (!ShouldReplace(U)) continue; LLVM_DEBUG(dbgs() << "Replace dominated use of '"; From->printAsOperand(dbgs()); @@ -3220,39 +3222,33 @@ unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) { unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root) { - auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) { - return DT.dominates(Root, U); - }; - return ::replaceDominatedUsesWith(From, To, Root, Dominates); + auto Dominates = [&](const Use &U) { return DT.dominates(Root, U); }; + return ::replaceDominatedUsesWith(From, To, Dominates); } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlock *BB) { - auto Dominates = [&DT](const BasicBlock *BB, const Use &U) { - return DT.dominates(BB, U); - }; - return ::replaceDominatedUsesWith(From, To, BB, Dominates); + auto Dominates = [&](const Use &U) { return DT.dominates(BB, U); }; + return ::replaceDominatedUsesWith(From, To, Dominates); } unsigned llvm::replaceDominatedUsesWithIf( Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root, function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { - auto DominatesAndShouldReplace = - [&DT, &ShouldReplace, To](const BasicBlockEdge &Root, const Use &U) { - return DT.dominates(Root, U) && ShouldReplace(U, To); - }; - return ::replaceDominatedUsesWith(From, To, Root, DominatesAndShouldReplace); + auto DominatesAndShouldReplace = [&](const Use &U) { + return DT.dominates(Root, U) && ShouldReplace(U, To); + }; + return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); } unsigned llvm::replaceDominatedUsesWithIf( Value *From, Value *To, DominatorTree &DT, const BasicBlock *BB, function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { - auto DominatesAndShouldReplace = [&DT, &ShouldReplace, - To](const BasicBlock *BB, const Use &U) { + auto DominatesAndShouldReplace = [&](const Use &U) { return DT.dominates(BB, U) && ShouldReplace(U, To); }; - return ::replaceDominatedUsesWith(From, To, BB, DominatesAndShouldReplace); + return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); } bool llvm::callsGCLeafFunction(const CallBase *Call, diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index bea76d3..472c03f 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -155,8 +155,9 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, return; } if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) { - AllocaInst *AI = cast<AllocaInst>(II->getArgOperand(1)); - if (getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting) + AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + if (!AI || + 
getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting)
       return;
     if (II->getIntrinsicID() == Intrinsic::lifetime_start)
       Info.AllocasToInstrument[AI].LifetimeStart.push_back(II);
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index de9deab..b22ecbc 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -721,7 +721,7 @@ PredicateInfo::~PredicateInfo() {
   CreatedDeclarations.clear();
 
   for (Function *F : FunctionPtrs) {
-    assert(F->user_begin() == F->user_end() &&
+    assert(F->users().empty() &&
            "PredicateInfo consumer did not remove all SSA copies.");
     F->eraseFromParent();
   }
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
index b972132..d67192f 100644
--- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -20,8 +20,12 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
+static cl::opt<int64_t>
+    DefaultFunctionEntryCount("profcheck-default-function-entry-count",
+                              cl::init(1000));
 namespace {
 class ProfileInjector {
   Function &F;
@@ -63,6 +67,19 @@ bool ProfileInjector::inject() {
   // will get the same BPI it does if the injector wasn't running.
   auto &BPI = FAM.getResult<BranchProbabilityAnalysis>(F);
 
+  // Inject a function count if there's none. It's reasonable for a pass to
+  // want to clear the MD_prof of a function with zero entry count. If the
+  // original profile (iFDO or AFDO) is empty for a function, it's simpler to
+  // require assigning it the 0-entry count explicitly than to mark every
+  // branch as cold. (We do want some explicit information, in the spirit of
+  // what this verifier aims to achieve: making dropping / corrupting MD_prof
+  // unit-testable.)
+  if (!F.getEntryCount(/*AllowSynthetic=*/true))
+    F.setEntryCount(DefaultFunctionEntryCount);
+  // If there is an entry count that's 0, then don't bother injecting. We won't
+  // verify these either.
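// Editor's note (illustrative sketch, not part of the patch): the contract
// this creates for a Function &F, using only APIs that appear in this hunk:
//
//   auto EC = F.getEntryCount(/*AllowSynthetic=*/true);
//   if (!EC) {
//     // No entry count at all: the verifier reports an error.
//   } else if (EC->getCount() == 0) {
//     // Explicitly cold: injection and verification both skip F.
//   } else {
//     // Real or injected count: qualifying terminators must carry MD_prof.
//   }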
+ if (F.getEntryCount(/*AllowSynthetic=*/true)->getCount() == 0) + return false; bool Changed = false; for (auto &BB : F) { auto *Term = getTerminatorBenefitingFromMDProf(BB); @@ -119,11 +136,20 @@ PreservedAnalyses ProfileInjectorPass::run(Function &F, PreservedAnalyses ProfileVerifierPass::run(Function &F, FunctionAnalysisManager &FAM) { + const auto EntryCount = F.getEntryCount(/*AllowSynthetic=*/true); + if (!EntryCount) { + F.getContext().emitError("Profile verification failed: function entry " + "count missing (set to 0 if cold)"); + return PreservedAnalyses::all(); + } + if (EntryCount->getCount() == 0) + return PreservedAnalyses::all(); for (const auto &BB : F) if (const auto *Term = ProfileInjector::getTerminatorBenefitingFromMDProf(BB)) if (!Term->getMetadata(LLVMContext::MD_prof)) - F.getContext().emitError("Profile verification failed"); + F.getContext().emitError( + "Profile verification failed: branch annotation missing"); - return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 674de57..deabacc 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6641,16 +6641,20 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, } case ArrayKind: { Type *IndexTy = DL.getIndexType(Array->getType()); + auto *ArrayTy = cast<ArrayType>(Array->getValueType()); - if (Index->getType() != IndexTy) + if (Index->getType() != IndexTy) { + unsigned OldBitWidth = Index->getType()->getIntegerBitWidth(); Index = Builder.CreateZExtOrTrunc(Index, IndexTy); + if (auto *Zext = dyn_cast<ZExtInst>(Index)) + Zext->setNonNeg( + isUIntN(OldBitWidth - 1, ArrayTy->getNumElements() - 1)); + } Value *GEPIndices[] = {ConstantInt::get(IndexTy, 0), Index}; - Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array, - GEPIndices, "switch.gep"); - return Builder.CreateLoad( - cast<ArrayType>(Array->getValueType())->getElementType(), GEP, - "switch.load"); + Value *GEP = + Builder.CreateInBoundsGEP(ArrayTy, Array, GEPIndices, "switch.gep"); + return Builder.CreateLoad(ArrayTy->getElementType(), GEP, "switch.load"); } } llvm_unreachable("Unknown lookup table kind!"); diff --git a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp new file mode 100644 index 0000000..6b18ece --- /dev/null +++ b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp @@ -0,0 +1,323 @@ +//===-------- SplitModuleByCategory.cpp - split a module by categories ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// See comments in the header. 
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SplitModuleByCategory.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include <map>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "split-module-by-category"
+
+namespace {
+
+// A vector that contains a group of functions with the same category.
+using EntryPointSet = SetVector<const Function *>;
+
+/// Represents a group of functions with one category.
+struct EntryPointGroup {
+  int ID;
+  EntryPointSet Functions;
+
+  EntryPointGroup() = default;
+
+  EntryPointGroup(int ID, EntryPointSet &&Functions = EntryPointSet())
+      : ID(ID), Functions(std::move(Functions)) {}
+
+  void clear() { Functions.clear(); }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  LLVM_DUMP_METHOD void dump() const {
+    constexpr size_t INDENT = 4;
+    dbgs().indent(INDENT) << "ENTRY POINTS"
+                          << " " << ID << " {\n";
+    for (const Function *F : Functions)
+      dbgs().indent(INDENT) << "  " << F->getName() << "\n";
+
+    dbgs().indent(INDENT) << "}\n";
+  }
+#endif
+};
+
+/// Annotates an llvm::Module with the information necessary to perform and
+/// track the result of splitting code into llvm::Module instances:
+/// - the entry point group from the module.
+class ModuleDesc {
+  std::unique_ptr<Module> M;
+  EntryPointGroup EntryPoints;
+
+public:
+  ModuleDesc(std::unique_ptr<Module> M,
+             EntryPointGroup &&EntryPoints = EntryPointGroup())
+      : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {
+    assert(this->M && "Module should be non-null");
+  }
+
+  Module &getModule() { return *M; }
+  const Module &getModule() const { return *M; }
+
+  std::unique_ptr<Module> releaseModule() {
+    EntryPoints.clear();
+    return std::move(M);
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  LLVM_DUMP_METHOD void dump() const {
+    dbgs() << "ModuleDesc[" << M->getName() << "] {\n";
+    EntryPoints.dump();
+    dbgs() << "}\n";
+  }
+#endif
+};
+
+bool isKernel(const Function &F) {
+  return F.getCallingConv() == CallingConv::SPIR_KERNEL ||
+         F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+         F.getCallingConv() == CallingConv::PTX_Kernel;
+}
+
+// Represents the "dependency" or "use" graph of global objects (functions and
+// global variables) in a module. It is used during code splitting to
+// understand which global variables and functions (other than entry points)
+// should be included in a split module.
+//
+// Nodes of the graph represent LLVM GlobalObjects; an edge "A" -> "B"
+// represents the fact that if "A" is included in a module, then "B" should be
+// included as well.
+//
+// Examples of dependencies which are represented in this graph:
+// - Function FA calls function FB
+// - Function FA uses global variable GA
+// - Global variable GA references (initialized with) function FB
+// - Function FA stores the address of function FB somewhere
+//
+// The following cases are treated as dependencies between global objects:
+// 1. Global object A is used by a global object B in any way (store,
+//    bitcast, phi node, call, etc.): "A" -> "B" edge will be added to the
+//    graph;
+// 2.
function A performs an indirect call of a function with signature S and
+//    there is a function B with signature S: an "A" -> "B" edge will be added
+//    to the graph;
+class DependencyGraph {
+public:
+  using GlobalSet = SmallPtrSet<const GlobalValue *, 16>;
+
+  DependencyGraph(const Module &M) {
+    // Group functions by their signature to handle case (2) described above.
+    DenseMap<const FunctionType *, DependencyGraph::GlobalSet>
+        FuncTypeToFuncsMap;
+    for (const Function &F : M.functions()) {
+      // Kernels can't be called (either directly or indirectly).
+      if (isKernel(F))
+        continue;
+
+      FuncTypeToFuncsMap[F.getFunctionType()].insert(&F);
+    }
+
+    for (const Function &F : M.functions()) {
+      // Case (1); see the comment above the class definition.
+      for (const Value *U : F.users())
+        addUserToGraphRecursively(cast<const User>(U), &F);
+
+      // Case (2); see the comment above the class definition.
+      for (const Instruction &I : instructions(F)) {
+        const CallBase *CB = dyn_cast<CallBase>(&I);
+        if (!CB || !CB->isIndirectCall()) // Direct calls were handled above.
+          continue;
+
+        const FunctionType *Signature = CB->getFunctionType();
+        GlobalSet &PotentialCallees = FuncTypeToFuncsMap[Signature];
+        Graph[&F].insert(PotentialCallees.begin(), PotentialCallees.end());
+      }
+    }
+
+    // Also process every global variable (their handling is a bit simpler).
+    for (const GlobalVariable &GV : M.globals())
+      for (const Value *U : GV.users())
+        addUserToGraphRecursively(cast<const User>(U), &GV);
+  }
+
+  iterator_range<GlobalSet::const_iterator>
+  dependencies(const GlobalValue *Val) const {
+    auto It = Graph.find(Val);
+    return (It == Graph.end())
+               ? make_range(EmptySet.begin(), EmptySet.end())
+               : make_range(It->second.begin(), It->second.end());
+  }
+
+private:
+  void addUserToGraphRecursively(const User *Root, const GlobalValue *V) {
+    SmallVector<const User *, 8> WorkList;
+    WorkList.push_back(Root);
+
+    while (!WorkList.empty()) {
+      const User *U = WorkList.pop_back_val();
+      if (const auto *I = dyn_cast<const Instruction>(U)) {
+        const Function *UFunc = I->getFunction();
+        Graph[UFunc].insert(V);
+      } else if (isa<const Constant>(U)) {
+        if (const auto *GV = dyn_cast<const GlobalVariable>(U))
+          Graph[GV].insert(V);
+        // This could be a global variable or some constant expression (like a
+        // bitcast or gep). We trace the users of this constant further to
+        // reach the global objects that use it and add them to the graph.
+        for (const User *UU : U->users())
+          WorkList.push_back(UU);
+      } else {
+        llvm_unreachable("Unhandled type of function user");
+      }
+    }
+  }
+
+  DenseMap<const GlobalValue *, GlobalSet> Graph;
+  SmallPtrSet<const GlobalValue *, 1> EmptySet;
+};
+
+void collectFunctionsAndGlobalVariablesToExtract(
+    SetVector<const GlobalValue *> &GVs, const Module &M,
+    const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &DG) {
+  // We start with the module entry points.
+  for (const Function *F : ModuleEntryPoints.Functions)
+    GVs.insert(F);
+
+  // Non-discardable global variables are also included in the initial set.
+  for (const GlobalVariable &GV : M.globals())
+    if (!GV.isDiscardableIfUnused())
+      GVs.insert(&GV);
+
+  // GVs is a SetVector, which inserts a value only if it is not already
+  // present, so the loop below processes each object exactly once.
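// Editor's note (illustrative sketch, not part of the patch): the loop below
// is the usual SetVector worklist idiom; a growing cursor visits each element
// exactly once, and inserting an already-present value is a no-op. Reduced to
// its standalone shape (neighborsOf is a hypothetical accessor):
//
//   SetVector<const GlobalValue *> WL;
//   WL.insert(Seed);
//   for (size_t I = 0; I < WL.size(); ++I)
//     for (const GlobalValue *Next : neighborsOf(WL[I]))
//       WL.insert(Next); // Ignored if Next was already enqueued.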
+  size_t Idx = 0;
+  while (Idx < GVs.size()) {
+    const GlobalValue *Obj = GVs[Idx++];
+
+    for (const GlobalValue *Dep : DG.dependencies(Obj)) {
+      if (const auto *Func = dyn_cast<const Function>(Dep)) {
+        if (!Func->isDeclaration())
+          GVs.insert(Func);
+      } else {
+        GVs.insert(Dep); // Global variables are added unconditionally.
+      }
+    }
+  }
+}
+
+ModuleDesc extractSubModule(const Module &M,
+                            const SetVector<const GlobalValue *> &GVs,
+                            EntryPointGroup &&ModuleEntryPoints) {
+  ValueToValueMapTy VMap;
+  // Clone definitions only for needed globals. Others will be added as
+  // declarations and removed later.
+  std::unique_ptr<Module> SubM = CloneModule(
+      M, VMap, [&](const GlobalValue *GV) { return GVs.contains(GV); });
+  // Replace entry points with cloned ones.
+  EntryPointSet NewEPs;
+  const EntryPointSet &EPs = ModuleEntryPoints.Functions;
+  llvm::for_each(
+      EPs, [&](const Function *F) { NewEPs.insert(cast<Function>(VMap[F])); });
+  ModuleEntryPoints.Functions = std::move(NewEPs);
+  return ModuleDesc{std::move(SubM), std::move(ModuleEntryPoints)};
+}
+
+// Produces a copy of the input LLVM IR module M containing only the entry
+// point functions specified in the ModuleEntryPoints vector, plus the
+// functions and globals reachable from those entry points.
+ModuleDesc extractCallGraph(const Module &M,
+                            EntryPointGroup &&ModuleEntryPoints,
+                            const DependencyGraph &DG) {
+  SetVector<const GlobalValue *> GVs;
+  collectFunctionsAndGlobalVariablesToExtract(GVs, M, ModuleEntryPoints, DG);
+
+  ModuleDesc SplitM = extractSubModule(M, GVs, std::move(ModuleEntryPoints));
+  LLVM_DEBUG(SplitM.dump());
+  return SplitM;
+}
+
+using EntryPointGroupVec = SmallVector<EntryPointGroup>;
+
+/// Module Splitter.
+/// It takes a module and a collection of entry point groups. Each group
+/// specifies a subset of entry points from the input module that should be
+/// included in a split module.
+class ModuleSplitter {
+private:
+  std::unique_ptr<Module> M;
+  EntryPointGroupVec Groups;
+  DependencyGraph DG;
+
+private:
+  EntryPointGroup drawEntryPointGroup() {
+    assert(Groups.size() > 0 && "Reached end of entry point groups list.");
+    EntryPointGroup Group = std::move(Groups.back());
+    Groups.pop_back();
+    return Group;
+  }
+
+public:
+  ModuleSplitter(std::unique_ptr<Module> Module, EntryPointGroupVec &&GroupVec)
+      : M(std::move(Module)), Groups(std::move(GroupVec)), DG(*M) {
+    assert(!Groups.empty() && "Entry points groups collection is empty!");
+  }
+
+  /// Gets the next group of entry points from the input module and provides a
+  /// split submodule containing those entry points and their dependencies.
+  ModuleDesc getNextSplit() {
+    return extractCallGraph(*M, drawEntryPointGroup(), DG);
+  }
+
+  /// Returns true if there are still submodules to split.
+  bool hasMoreSplits() const { return Groups.size() > 0; }
+};
+
+EntryPointGroupVec selectEntryPointGroups(
+    const Module &M, function_ref<std::optional<int>(const Function &F)> EPC) {
+  // std::map is used here to ensure a stable ordering of entry point groups,
+  // which is based on their contents; this greatly helps LIT tests.
+  // Note: EPC is allowed to return large identifiers, hence the
+  // std::map + SmallVector approach here.
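// Editor's note (hypothetical usage, not part of the patch): an EPC callback
// passed to selectEntryPointGroups might bucket kernels by calling convention
// and leave everything else out of the entry-point groups:
//
//   auto EPC = [](const Function &F) -> std::optional<int> {
//     if (!isKernel(F))
//       return std::nullopt;
//     return F.getCallingConv() == CallingConv::SPIR_KERNEL ? 0 : 1;
//   };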
+ std::map<int, EntryPointSet> EntryPointsMap; + + for (const auto &F : M.functions()) + if (std::optional<int> Category = EPC(F); Category) + EntryPointsMap[*Category].insert(&F); + + EntryPointGroupVec Groups; + Groups.reserve(EntryPointsMap.size()); + for (auto &[Key, EntryPoints] : EntryPointsMap) + Groups.emplace_back(Key, std::move(EntryPoints)); + + return Groups; +} + +} // namespace + +void llvm::splitModuleTransitiveFromEntryPoints( + std::unique_ptr<Module> M, + function_ref<std::optional<int>(const Function &F)> EntryPointCategorizer, + function_ref<void(std::unique_ptr<Module> Part)> Callback) { + EntryPointGroupVec Groups = selectEntryPointGroups(*M, EntryPointCategorizer); + ModuleSplitter Splitter(std::move(M), std::move(Groups)); + while (Splitter.hasMoreSplits()) { + ModuleDesc MD = Splitter.getNextSplit(); + Callback(MD.releaseModule()); + } +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ea0fa06..912c893 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -263,6 +263,13 @@ public: new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::inBounds(), DL, Name)); } + VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + return tryInsertInstruction( + new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, + GEPNoWrapFlags::none(), DL, Name)); + } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, const Twine &Name = "") { @@ -486,6 +493,13 @@ public: /// all profitable VFs in ProfitableVFs. VectorizationFactor computeBestVF(); + /// \return The desired interleave count. + /// If interleave count has been specified by metadata it will be returned. + /// Otherwise, the interleave count is computed and returned. VF and LoopCost + /// are the selected vectorization factor and the cost of the selected VF. + unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, + InstructionCost LoopCost); + /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan /// according to the best selected \p VF and \p UF. /// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b4ea70e..eb0e0fd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -974,13 +974,6 @@ public: /// 64 bit loop indices. std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); - /// \return The desired interleave count. - /// If interleave count has been specified by metadata it will be returned. - /// Otherwise, the interleave count is computed and returned. VF and LoopCost - /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, - InstructionCost LoopCost); - /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. /// This function takes cost-based decisions for Load/Store instructions @@ -1590,7 +1583,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. 
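// Editor's note (rationale sketch; an assumption, not stated in the patch):
// unlike DenseMap, MapVector iterates in insertion order, so the switch below
// makes any walk over the scalar-cost map deterministic:
//
//   MapVector<Instruction *, InstructionCost> Costs;
//   Costs.insert({I0, C0});
//   Costs.insert({I1, C1});
//   for (const auto &[I, C] : Costs) // Always visits I0, then I1.
//     ;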
- using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; + using ScalarCostsTy = MapVector<Instruction *, InstructionCost>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. @@ -4653,8 +4646,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } unsigned -LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, - InstructionCost LoopCost) { +LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, + InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -4669,11 +4662,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!isScalarEpilogueAllowed()) + if (!CM.isScalarEpilogueAllowed()) return 1; - // Do not interleave if EVL is preferred and no User IC is specified. - if (foldTailWithEVL()) { + if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), + IsaPred<VPEVLBasedIVPHIRecipe>)) { LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " "Unroll factor forced to be 1.\n"); return 1; @@ -4686,15 +4679,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // We don't attempt to perform interleaving for loops with uncountable early // exits because the VPInstruction::AnyOf code cannot currently handle // multiple parts. - if (Legal->hasUncountableEarlyExit()) + if (Plan.hasEarlyExit()) return 1; - const bool HasReductions = !Legal->getReductionVars().empty(); + const bool HasReductions = + any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), + IsaPred<VPReductionPHIRecipe>); // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { - LoopCost = expectedCost(VF); + if (VF.isScalar()) + LoopCost = CM.expectedCost(VF); + else + LoopCost = cost(Plan, VF); assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. @@ -4703,7 +4701,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, } VPRegisterUsage R = - calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0]; + calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. for (auto &Pair : R.MaxLocalUsers) { @@ -4766,23 +4764,24 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // Try to get the exact trip count, or an estimate based on profiling data or // ConstantMax from PSE, failing that. - auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); + auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop); // For fixed length VFs treat a scalable trip count as unknown. if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { // Re-evaluate trip counts and VFs to be in the same numerical space. 
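// Editor's worked example (added for illustration, values assumed): in the
// re-evaluation below, a scalable VF of vscale x 4 with an assumed vscale of
// 2 gives EstimatedVF = 8. For a best-known trip count of 100 and
// MaxInterleaveCount = 8, the lower bound evaluates as
//
//   bit_floor(max(1, min(100 / (8 * 2), 8))) = bit_floor(6) = 4.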
- unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); - unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); + unsigned AvailableTC = + estimateElementCount(*BestKnownTC, CM.getVScaleForTuning()); + unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning()); // At least one iteration must be scalar when this constraint holds. So the // maximum available iterations for interleaving is one less. - if (requiresScalarEpilogue(VF.isVector())) + if (CM.requiresScalarEpilogue(VF.isVector())) --AvailableTC; unsigned InterleaveCountLB = bit_floor(std::max( 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); - if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) { + if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) { // If the best known trip count is exact, we select between two // prospective ICs, where // @@ -4843,7 +4842,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = - (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = @@ -4866,8 +4865,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // Interleave until store/load ports (estimated by max interleave count) are // saturated. - unsigned NumStores = Legal->getNumStores(); - unsigned NumLoads = Legal->getNumLoads(); + unsigned NumStores = 0; + unsigned NumLoads = 0; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) { + NumLoads++; + continue; + } + if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) { + NumStores++; + continue; + } + + if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) { + if (unsigned StoreOps = InterleaveR->getNumStoreOperands()) + NumStores += StoreOps; + else + NumLoads += InterleaveR->getNumDefinedValues(); + continue; + } + if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) { + NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr()); + NumStores += isa<StoreInst>(RepR->getUnderlyingInstr()); + continue; + } + if (isa<VPHistogramRecipe>(&R)) { + NumLoads++; + NumStores++; + continue; + } + } + } unsigned StoresIC = IC / (NumStores ? NumStores : 1); unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); @@ -4877,12 +4907,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // do the final reduction after the loop. 
bool HasSelectCmpReductions = HasReductions && - any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { - const RecurrenceDescriptor &RdxDesc = Reduction.second; - RecurKind RK = RdxDesc.getRecurrenceKind(); - return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || - RecurrenceDescriptor::isFindIVRecurrenceKind(RK); - }); + any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), + [](VPRecipeBase &R) { + auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R); + return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RedR->getRecurrenceKind()) || + RecurrenceDescriptor::isFindIVRecurrenceKind( + RedR->getRecurrenceKind())); + }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); return 1; @@ -4893,12 +4925,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // we're interleaving is inside another loop. For tree-wise reductions // set the limit to 2, and for ordered reductions it's best to disable // interleaving entirely. - if (HasReductions && TheLoop->getLoopDepth() > 1) { + if (HasReductions && OrigLoop->getLoopDepth() > 1) { bool HasOrderedReductions = - any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { - const RecurrenceDescriptor &RdxDesc = Reduction.second; - return RdxDesc.isOrdered(); - }); + any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), + [](VPRecipeBase &R) { + auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R); + + return RedR && RedR->isOrdered(); + }); if (HasOrderedReductions) { LLVM_DEBUG( dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); @@ -4992,7 +5026,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { - ScalarCostsVF.insert_range(ScalarCosts); + for (const auto &[I, IC] : ScalarCosts) + ScalarCostsVF.insert({I, IC}); // Check if we decided to scalarize a call. If so, update the widening // decision of the call to CM_Scalarize with the computed scalar cost. for (const auto &[I, Cost] : ScalarCosts) { @@ -7302,6 +7337,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Retrieving VectorPH now when it's easier while VPlan still has Regions. VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); + VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); VPlanTransforms::removeBranchOnConst(BestVPlan); @@ -7317,6 +7353,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::dissolveLoopRegions(BestVPlan); // Canonicalize EVL loops after regions are dissolved. VPlanTransforms::canonicalizeEVLLoops(BestVPlan); + VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH); + // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, OrigLoop->getParentLoop(), @@ -7373,7 +7411,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 2. Copy and widen instructions from the old loop into the new loop. 
BestVPlan.prepareToExecute( - ILV.getTripCount(), ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); @@ -10119,7 +10156,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. - IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); + IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 593868f..62ab3f52 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -991,6 +991,17 @@ class BinOpSameOpcodeHelper { return Candidate & OrBIT; case Instruction::Xor: return Candidate & XorBIT; + case Instruction::LShr: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::FRem: + return false; default: break; } @@ -1238,6 +1249,12 @@ public: BinOpSameOpcodeHelper Converter(MainOp); if (!Converter.add(I) || !Converter.add(MainOp)) return nullptr; + if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) { + BinOpSameOpcodeHelper AltConverter(AltOp); + if (AltConverter.add(I) && AltConverter.add(AltOp) && + AltConverter.hasCandidateOpcode(AltOp->getOpcode())) + return AltOp; + } if (Converter.hasAltOp() && !isAltShuffle()) return nullptr; return Converter.hasAltOp() ? AltOp : MainOp; @@ -1329,7 +1346,7 @@ public: // If the copyable instructions comes after MainOp // (non-schedulable, but used in the block) - cannot vectorize // it, will possibly generate use before def. - (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I))); + !MainOp->comesBefore(I)); }; return IsNonSchedulableCopyableElement(V); @@ -18887,8 +18904,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!UseIntrinsic) { VFShape Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed( - static_cast<unsigned>(VecTy->getNumElements())), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 25b9616..8052e31 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -951,17 +951,7 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, - VPTransformState &State) { - Type *TCTy = TripCountV->getType(); - // Check if the backedge taken count is needed, and if so build it. 
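// Editor's note: the removal below does not drop the computation. Later in
// this patch, VPlanTransforms::materializeBackedgeTakenCount recreates it as
// a VPInstruction in the vector preheader, emitting the equivalent of
//
//   EMIT vp<%trip.count.minus.1> = sub vp<%trip.count>, ir<1>
//
// (the printed form is illustrative), so the backedge-taken count is modeled
// in VPlan and can be simplified there rather than built directly with an
// IRBuilder during execution.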
-  if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
-    IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
-    auto *TCMO = Builder.CreateSub(TripCountV, ConstantInt::get(TCTy, 1),
-                                   "trip.count.minus.1");
-    BackedgeTakenCount->setUnderlyingValue(TCMO);
-  }
-
+void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
   if (!VectorTripCount.getUnderlyingValue())
     VectorTripCount.setUnderlyingValue(VectorTripCountV);
   else
@@ -969,6 +959,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
            "VectorTripCount set earlier must match VectorTripCountV");
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+  Type *TCTy = VectorTripCountV->getType();
   // FIXME: Model VF * UF computation completely in VPlan.
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
@@ -1047,21 +1038,6 @@ void VPlan::execute(VPTransformState *State) {
     if (isa<VPWidenPHIRecipe>(&R))
       continue;
 
-    if (auto *WidenPhi = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
-      assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-             "recipe generating only scalars should have been replaced");
-      auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-      PHINode *Phi = cast<PHINode>(GEP->getPointerOperand());
-
-      Phi->setIncomingBlock(1, VectorLatchBB);
-
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
-      continue;
-    }
-
     auto *PhiR = cast<VPSingleDefRecipe>(&R);
     // VPInstructions currently model scalar Phis only.
     bool NeedsScalar = isa<VPInstruction>(PhiR) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a5de593..8dfb982 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -991,6 +991,9 @@ public:
   // operand). Only generates scalar values (either for the first lane only or
   // for all lanes, depending on its uses).
   PtrAdd,
+  // Add a vector offset in bytes (second operand) to a scalar base pointer
+  // (first operand).
+  WidePtrAdd,
   // Returns a scalar boolean value, which is true if any lane of its
   // (boolean) vector operands is true. It produces the reduced value across
   // all unrolled iterations. Unrolling will add all copies of its original
@@ -1979,6 +1982,9 @@ public:
   /// Update the step value of the recipe.
   void setStepValue(VPValue *V) { setOperand(1, V); }
 
+  VPValue *getVFValue() { return getOperand(2); }
+  const VPValue *getVFValue() const { return getOperand(2); }
+
   /// Returns the number of incoming values, also number of incoming blocks.
   /// Note that at the moment, VPWidenPointerInductionRecipe only has a single
   /// incoming value, its start value.
@@ -2068,9 +2074,6 @@ public:
                VPSlotTracker &SlotTracker) const override;
 #endif
 
-  VPValue *getVFValue() { return getOperand(2); }
-  const VPValue *getVFValue() const { return getOperand(2); }
-
   VPValue *getSplatVFValue() {
     // If the recipe has been unrolled return the VPValue for the induction
     // increment.
@@ -2106,8 +2109,7 @@ public:
   }
 };
 
-class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
-                                      public VPUnrollPartAccessor<4> {
+class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe {
   bool IsScalarAfterVectorization;
 
 public:
@@ -2136,18 +2138,14 @@ public:
   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
 
   /// Generate vector values for the pointer induction.
- void execute(VPTransformState &State) override; + void execute(VPTransformState &State) override { + llvm_unreachable("cannot execute this recipe, should be expanded via " + "expandVPWidenPointerInduction"); + }; /// Returns true if only scalar values will be generated. bool onlyScalarsGenerated(bool IsScalable); - /// Returns the VPValue representing the value of this induction at - /// the first unrolled part, if it exists. Returns itself if unrolling did not - /// take place. - VPValue *getFirstUnrolledPartOperand() { - return getUnrollPart(*this) == 0 ? this : getOperand(3); - } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -3958,8 +3956,7 @@ public: } /// Prepare the plan for execution, setting up the required live-in values. - void prepareToExecute(Value *TripCount, Value *VectorTripCount, - VPTransformState &State); + void prepareToExecute(Value *VectorTripCount, VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); @@ -4133,6 +4130,18 @@ public: return It->second; } + /// Return a VPValue wrapping i1 true. + VPValue *getTrue() { + LLVMContext &Ctx = getContext(); + return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); + } + + /// Return a VPValue wrapping i1 false. + VPValue *getFalse() { + LLVMContext &Ctx = getContext(); + return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + } + /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); } @@ -4229,7 +4238,10 @@ public: /// block with multiple predecessors (one for the exit via the latch and one /// via the other early exit). bool hasEarlyExit() const { - return ExitBlocks.size() > 1 || + return count_if(ExitBlocks, + [](VPIRBasicBlock *EB) { + return EB->getNumPredecessors() != 0; + }) > 1 || (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 16072f2..4c3cdda 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -128,6 +128,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return IntegerType::get(Ctx, 1); case VPInstruction::Broadcast: case VPInstruction::PtrAdd: + case VPInstruction::WidePtrAdd: // Return the type based on first operand. return inferScalarType(R->getOperand(0)); case VPInstruction::BranchOnCond: diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h index b77aa9d..c79485c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h +++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h @@ -231,6 +231,13 @@ vp_post_order_shallow(VPBlockBase *G) { } /// Returns an iterator range to traverse the graph starting at \p G in +/// post order while traversing through region blocks. +inline iterator_range<po_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>> +vp_post_order_deep(VPBlockBase *G) { + return post_order(VPBlockDeepTraversalWrapper<VPBlockBase *>(G)); +} + +/// Returns an iterator range to traverse the graph starting at \p G in /// depth-first order while traversing through region blocks. 
inline iterator_range<df_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>> vp_depth_first_deep(VPBlockBase *G) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 6c1f53b..1b91901 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -570,8 +570,7 @@ void VPlanTransforms::prepareForVectorization( VPBuilder Builder(MiddleVPBB); VPValue *Cmp; if (!RequiresScalarEpilogueCheck) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getFalse(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getFalse(); else if (TailFolded) Cmp = Plan.getOrAddLiveIn( ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); @@ -671,13 +670,12 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { Intrinsic::ID RdxIntrinsicId = RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum : Intrinsic::minnum; - assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) && - cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() == - RdxIntrinsicId) || - (RepR && - cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() == - RdxIntrinsicId) && - "Intrinsic did not match recurrence kind"); + assert(((isa<VPWidenIntrinsicRecipe>(MinMaxR) && + cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() == + RdxIntrinsicId) || + (RepR && cast<IntrinsicInst>(RepR->getUnderlyingInstr()) + ->getIntrinsicID() == RdxIntrinsicId)) && + "Intrinsic did not match recurrence kind"); #endif if (MinMaxR->getOperand(0) == RedPhiR) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 11b4677..47a8077 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -478,6 +478,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::PtrAdd: + case VPInstruction::WidePtrAdd: case VPInstruction::WideIVStep: return 2; case Instruction::Select: @@ -858,6 +859,12 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Addend = State.get(getOperand(1), VPLane(0)); return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } + case VPInstruction::WidePtrAdd: { + Value *Ptr = + State.get(getOperand(0), vputils::isSingleScalar(getOperand(0))); + Value *Addend = State.get(getOperand(1)); + return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); + } case VPInstruction::AnyOf: { Value *Res = State.get(getOperand(0)); for (VPValue *Op : drop_begin(operands())) @@ -1085,6 +1092,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::Not: case VPInstruction::PtrAdd: case VPInstruction::WideIVStep: + case VPInstruction::WidePtrAdd: case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: return false; @@ -1123,6 +1131,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return true; case VPInstruction::PtrAdd: return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); + case VPInstruction::WidePtrAdd: + return Op == getOperand(0); case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); @@ -1231,6 +1241,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::WidePtrAdd: + O << "wide-ptradd"; + break; case 
VPInstruction::AnyOf: O << "any-of"; break; @@ -1817,7 +1830,8 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::AShr; case OperationType::GEPOp: return Opcode == Instruction::GetElementPtr || - Opcode == VPInstruction::PtrAdd; + Opcode == VPInstruction::PtrAdd || + Opcode == VPInstruction::WidePtrAdd; case OperationType::FPMathOp: return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || Opcode == Instruction::FSub || Opcode == Instruction::FNeg || @@ -2836,12 +2850,12 @@ static void scalarizeInstruction(const Instruction *Instr, Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) { Cloned->setName(Instr->getName() + ".cloned"); -#if !defined(NDEBUG) - // Verify that VPlan type inference results agree with the type of the - // generated values. - assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && - "inferred type and type from generated instructions do not match"); -#endif + Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe); + // The operands of the replicate recipe may have been narrowed, resulting in + // a narrower result type. Update the type of the cloned instruction to the + // correct type. + if (ResultTy != Cloned->getType()) + Cloned->mutateType(ResultTy); } RepRecipe->applyFlags(*Cloned); @@ -3682,87 +3696,6 @@ bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { (!IsScalable || vputils::onlyFirstLaneUsed(this)); } -void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { - assert(getInductionDescriptor().getKind() == - InductionDescriptor::IK_PtrInduction && - "Not a pointer induction according to InductionDescriptor!"); - assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && - "Unexpected type."); - assert(!onlyScalarsGenerated(State.VF.isScalable()) && - "Recipe should have been replaced"); - - unsigned CurrentPart = getUnrollPart(*this); - - // Build a pointer phi - Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); - Type *ScStValueType = ScalarStartValue->getType(); - - BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - PHINode *NewPointerPhi = nullptr; - if (CurrentPart == 0) { - IRBuilder<>::InsertPointGuard Guard(State.Builder); - if (State.Builder.GetInsertPoint() != - State.Builder.GetInsertBlock()->getFirstNonPHIIt()) - State.Builder.SetInsertPoint( - State.Builder.GetInsertBlock()->getFirstNonPHIIt()); - NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi"); - NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); - NewPointerPhi->setDebugLoc(getDebugLoc()); - } else { - // The recipe has been unrolled. In that case, fetch the single pointer phi - // shared among all unrolled parts of the recipe. - auto *GEP = - cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); - NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); - } - - // A pointer induction, performed by using a gep - BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); - Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); - Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); - Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); - // Add induction update using an incorrect block temporarily. The phi node - // will be fixed after VPlan execution. Note that at this point the latch - // block cannot be used, as it does not exist yet. 
- // TODO: Model increment value in VPlan, by turning the recipe into a - // multi-def and a subclass of VPHeaderPHIRecipe. - if (CurrentPart == 0) { - // The recipe represents the first part of the pointer induction. Create the - // GEP to increment the phi across all unrolled parts. - Value *NumUnrolledElems = State.get(getOperand(2), true); - - Value *InductionGEP = GetElementPtrInst::Create( - State.Builder.getInt8Ty(), NewPointerPhi, - State.Builder.CreateMul( - ScalarStepValue, - State.Builder.CreateTrunc(NumUnrolledElems, PhiType)), - "ptr.ind", InductionLoc); - - NewPointerPhi->addIncoming(InductionGEP, VectorPH); - } - - // Create actual address geps that use the pointer phi as base and a - // vectorized version of the step value (<step*0, ..., step*N>) as offset. - Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = State.Builder.CreateMul( - RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); - Value *StartOffset = - State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); - // Create a vector of consecutive numbers from zero to VF. - StartOffset = State.Builder.CreateAdd( - StartOffset, State.Builder.CreateStepVector(VecPhiType)); - - assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && - "scalar step must be the same across all parts"); - Value *GEP = State.Builder.CreateGEP( - State.Builder.getInt8Ty(), NewPointerPhi, - State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( - State.VF, ScalarStepValue)), - "vector.gep"); - State.set(this, GEP); -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -3921,11 +3854,6 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); - // Manually move it with the other PHIs in case PHI recipes above this one - // also inserted non-phi instructions. - // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in - // convertToConcreteRecipes. - VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt()); State.set(this, VecPhi); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index fcbc86f..a7965a0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -545,10 +545,8 @@ static bool isDeadRecipe(VPRecipeBase &R) { } void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getEntry()); - - for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_post_order_deep(Plan.getEntry()))) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. 
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { @@ -846,8 +844,8 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, if (ScalarTy->isIntegerTy()) return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { - auto *Zero = Plan.getOrAddLiveIn( - ConstantInt::get(Step->getLiveInIRValue()->getType(), 0)); + Type *StepTy = TypeInfo.inferScalarType(Step); + auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), {}, "ind.escape"); @@ -965,6 +963,7 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, RFlags.getGEPNoWrapFlags()); } case VPInstruction::PtrAdd: + case VPInstruction::WidePtrAdd: return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], Ops[1], cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); @@ -1431,15 +1430,15 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, // count is not conveniently available as SCEV so far, so we compare directly // against the original trip count. This is stricter than necessary, as we // will only return true if the trip count == vector trip count. - // TODO: Use SCEV for vector trip count once available, to cover cases where - // vector trip count == UF * VF, but original trip count != UF * VF. - const SCEV *TripCount = - vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TripCount) && + const SCEV *VectorTripCount = + vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE); + if (isa<SCEVCouldNotCompute>(VectorTripCount)) + VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(VectorTripCount) && "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); - const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); - return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C); + const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements); + return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C); } /// Try to simplify the branch condition of \p Plan. This may restrict the @@ -1504,10 +1503,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, } else { // The vector region contains header phis for which we cannot remove the // loop region yet. - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()}, + Term->getDebugLoc()); ExitingVPBB->appendRecipe(BOC); } @@ -2173,7 +2170,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); VPTypeAnalysis TypeInfo(CanonicalIVType); LLVMContext &Ctx = CanonicalIVType->getContext(); - VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); + VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2754,6 +2751,70 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, WidenIVR->replaceAllUsesWith(WidePHI); } +/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the +/// initial value, phi and backedge value. 
In the following example:
+///
+/// <x1> vector loop: {
+///   vector.body:
+///     EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
+///     ...
+///     EMIT branch-on-count ...
+/// }
+///
+/// WIDEN-POINTER-INDUCTION will get expanded to:
+///
+/// <x1> vector loop: {
+///   vector.body:
+///     EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
+///     EMIT %mul = mul %stepvector, %step
+///     EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
+///     ...
+///     EMIT %ptr.ind = ptradd %pointer.phi, %vf
+///     EMIT branch-on-count ...
+/// }
+static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
+                                          VPTypeAnalysis &TypeInfo) {
+  VPlan *Plan = R->getParent()->getPlan();
+  VPValue *Start = R->getStartValue();
+  VPValue *Step = R->getStepValue();
+  VPValue *VF = R->getVFValue();
+
+  assert(R->getInductionDescriptor().getKind() ==
+             InductionDescriptor::IK_PtrInduction &&
+         "Not a pointer induction according to InductionDescriptor!");
+  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
+  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
+         "Recipe should have been replaced");
+
+  VPBuilder Builder(R);
+  DebugLoc DL = R->getDebugLoc();
+
+  // Build a scalar pointer phi.
+  VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
+
+  // Create actual address geps that use the pointer phi as base and a
+  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
+  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+  Type *StepTy = TypeInfo.inferScalarType(Step);
+  VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
+  Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
+  VPValue *PtrAdd = Builder.createNaryOp(
+      VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
+  R->replaceAllUsesWith(PtrAdd);
+
+  // Create the backedge value for the scalar pointer phi.
+  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+  VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
+                                       DL);
+  VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
+
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
+  VPValue *InductionGEP =
+      Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
+  ScalarPtrPhi->addOperand(InductionGEP);
+}
+
 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
   // Replace loop regions with explicit CFG.
   SmallVector<VPRegionBlock *> LoopRegions;
@@ -2779,6 +2840,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
       continue;
     }
 
+    if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+      expandVPWidenPointerInduction(WidenIVR, TypeInfo);
+      ToRemove.push_back(WidenIVR);
+      continue;
+    }
+
     // Expand VPBlendRecipe into VPInstruction::Select.
     // Expand VPBlendRecipe into VPInstruction::Select.
     VPBuilder Builder(&R);
     if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
@@ -3178,6 +3245,21 @@ void VPlanTransforms::materializeVectorTripCount(
   Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
 }
 
+void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
+                                                    VPBasicBlock *VectorPH) {
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  if (BTC->getNumUsers() == 0)
+    return;
+
+  VPBuilder Builder(VectorPH, VectorPH->begin());
+  auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+  auto *TCMO = Builder.createNaryOp(
+      Instruction::Sub,
+      {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+      DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
+  BTC->replaceAllUsesWith(TCMO);
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 880159f..5943684 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,11 @@ struct VPlanTransforms {
                                       unsigned BestUF,
                                       PredicatedScalarEvolution &PSE);
 
+  /// Materialize the backedge-taken count to be computed explicitly using
+  /// VPInstructions.
+  static void materializeBackedgeTakenCount(VPlan &Plan,
+                                            VPBasicBlock *VectorPH);
+
   /// Try to convert a plan with interleave groups with VF elements to a plan
   /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 871e37e..fc072de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -65,7 +65,7 @@ class UnrollState {
 
   /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
   /// the induction steps for each part.
-  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
+  void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                 VPBasicBlock::iterator InsertPtForPhi);
 
   VPValue *getConstantVPV(unsigned Part) {
@@ -148,7 +148,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
 }
 
 void UnrollState::unrollWidenInductionByUF(
-    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
+    VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
   VPBasicBlock *PH = cast<VPBasicBlock>(
       IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
   Type *IVTy = TypeInfo.inferScalarType(IV);
@@ -159,9 +159,11 @@ void UnrollState::unrollWidenInductionByUF(
   VPValue *ScalarStep = IV->getStepValue();
   VPBuilder Builder(PH);
+  Type *VectorStepTy =
+      IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy;
   VPInstruction *VectorStep = Builder.createNaryOp(
-      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, IVTy, Flags,
-      IV->getDebugLoc());
+      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy,
+      Flags, IV->getDebugLoc());
 
   ToSkip.insert(VectorStep);
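Stepping back to the materializeBackedgeTakenCount addition in the VPlanTransforms.cpp hunk above: the backedge-taken count is just the trip count minus one, now emitted as an explicit Sub ("trip.count.minus.1") in the vector preheader whenever the value has users. A trivial standalone model, assuming a trip count of at least one (the helper name is hypothetical, not the VPlan API):

```cpp
#include <cassert>
#include <cstdint>

// Backedge-taken count: the backedge executes one time fewer than the loop
// header, i.e. trip.count.minus.1. Assumes TripCount >= 1.
uint64_t backedgeTakenCount(uint64_t TripCount) { return TripCount - 1; }

int main() {
  assert(backedgeTakenCount(8) == 7); // 8 header executions, 7 backedges
  return 0;
}
```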
@@ -169,8 +171,8 @@
   // remains the header phi. Parts > 0 are computed by adding Step to the
   // previous part. The header phi recipe will get 2 new operands: the step
   // value for a single part and the last part, used to compute the backedge
-  // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =
-  // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
+  // value during VPWidenInductionRecipe::execute.
+  // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
   // %Part.1 = %Part.0 + %VectorStep
   // %Part.2 = %Part.1 + %VectorStep
   // %Part.3 = %Part.2 + %VectorStep
@@ -179,8 +181,13 @@
   // again.
   VPValue *Prev = IV;
   Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
-  unsigned AddOpc =
-      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
+  unsigned AddOpc;
+  if (IVTy->isPointerTy())
+    AddOpc = VPInstruction::WidePtrAdd;
+  else if (IVTy->isFloatingPointTy())
+    AddOpc = ID.getInductionOpcode();
+  else
+    AddOpc = Instruction::Add;
   for (unsigned Part = 1; Part != UF; ++Part) {
     std::string Name =
         Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
@@ -207,7 +214,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
     return;
 
   // Generate step vectors for each unrolled part.
-  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
+  if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) {
     unrollWidenInductionByUF(IV, InsertPtForPhi);
     return;
   }
@@ -221,10 +228,7 @@
     VPRecipeBase *Copy = R->clone();
     Copy->insertBefore(*R->getParent(), InsertPt);
     addRecipeForPart(R, Copy, Part);
-    if (isa<VPWidenPointerInductionRecipe>(R)) {
-      Copy->addOperand(R);
-      Copy->addOperand(getConstantVPV(Part));
-    } else if (RdxPhi) {
+    if (RdxPhi) {
       // If the start value is a ReductionStartVector, use the identity value
       // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
@@ -450,8 +454,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
       Unroller.remapOperand(&H, 1, UF - 1);
       continue;
     }
-    if (Unroller.contains(H.getVPSingleValue()) ||
-        isa<VPWidenPointerInductionRecipe>(&H)) {
+    if (Unroller.contains(H.getVPSingleValue())) {
       Part = 1;
       continue;
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21b..14f20c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -73,8 +73,11 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
 }
 
 const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
-  if (V->isLiveIn())
-    return SE.getSCEV(V->getLiveInIRValue());
+  if (V->isLiveIn()) {
+    if (Value *LiveIn = V->getLiveInIRValue())
+      return SE.getSCEV(LiveIn);
+    return SE.getCouldNotCompute();
+  }
 
   // TODO: Support constructing SCEVs for more recipes as needed.
   return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
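The part-by-part chaining described in the unrollWidenInductionByUF comment above (%Part.N = %Part.N-1 + %VectorStep, with %VectorStep produced once by WideIVStep as VF times the scalar step) can be sanity-checked with a small self-contained C++ model. UF, VF, and the unit step below are arbitrary choices for illustration:

```cpp
#include <array>
#include <cassert>

int main() {
  constexpr unsigned UF = 4, VF = 4;
  const int VectorStep = VF * 1; // %VectorStep = WideIVStep: VF * scalar step

  // Part 0 is the header phi of the widened IV: lanes <0, 1, 2, 3> for a
  // start of 0 and a scalar step of 1 in the first vector iteration.
  std::array<std::array<int, VF>, UF> Parts;
  for (unsigned Lane = 0; Lane != VF; ++Lane)
    Parts[0][Lane] = static_cast<int>(Lane);

  for (unsigned P = 1; P != UF; ++P) // %Part.P = %Part.(P-1) + %VectorStep
    for (unsigned Lane = 0; Lane != VF; ++Lane)
      Parts[P][Lane] = Parts[P - 1][Lane] + VectorStep;

  // Together the UF parts cover lanes 0 .. UF*VF-1 of one unrolled iteration.
  assert(Parts[UF - 1][VF - 1] == static_cast<int>(UF * VF) - 1);
  return 0;
}
```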