Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Analysis/ConstantFolding.cpp | 7
-rw-r--r-- llvm/lib/Analysis/Loads.cpp | 2
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 76
-rw-r--r-- llvm/lib/CodeGen/MachineScheduler.cpp | 66
-rw-r--r-- llvm/lib/CodeGen/RegAllocBase.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 106
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 5
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4
-rw-r--r-- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 2
-rw-r--r-- llvm/lib/Frontend/Offloading/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Frontend/Offloading/PropertySet.cpp | 102
-rw-r--r-- llvm/lib/IR/DebugInfoMetadata.cpp | 2
-rw-r--r-- llvm/lib/MC/MCAssembler.cpp | 43
-rw-r--r-- llvm/lib/MC/MCCodeView.cpp | 2
-rw-r--r-- llvm/lib/MC/MCObjectStreamer.cpp | 121
-rw-r--r-- llvm/lib/MC/MCSection.cpp | 6
-rw-r--r-- llvm/lib/MC/MCWin64EH.cpp | 3
-rw-r--r-- llvm/lib/MC/MCWinCOFFStreamer.cpp | 5
-rw-r--r-- llvm/lib/Remarks/RemarkLinker.cpp | 4
-rw-r--r-- llvm/lib/Support/APFloat.cpp | 130
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 11
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 122
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7
-rw-r--r-- llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 69
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 61
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 37
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 75
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 22
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 36
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 134
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 12
-rw-r--r-- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 14
-rw-r--r-- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 11
-rw-r--r-- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 29
-rw-r--r-- llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp | 19
-rw-r--r-- llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp | 3
-rw-r--r-- llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 15
-rw-r--r-- llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 14
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 49
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp | 13
-rw-r--r-- llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp | 15
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 10
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 6
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 380
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6
-rw-r--r-- llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 12
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 14
-rw-r--r-- llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp | 24
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 49
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 3
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h | 21
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 5
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 157
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 22
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 4
-rw-r--r-- llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp | 3
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 23
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 23
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 4
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 180
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 3
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 7
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 2
-rw-r--r-- llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 11
-rw-r--r-- llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 13
-rw-r--r-- llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp | 11
-rw-r--r-- llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp | 11
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 12
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 11
-rw-r--r-- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp | 11
-rw-r--r-- llvm/lib/TargetParser/TargetParser.cpp | 1
-rw-r--r-- llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 5
-rw-r--r-- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 46
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 7
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 24
-rw-r--r-- llvm/lib/Transforms/Scalar/LICM.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 133
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 28
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanCFG.h | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 14
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 25
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 7
105 files changed, 1954 insertions, 1011 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 2d52f34..dd98b62 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2679,11 +2679,12 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::nvvm_round_ftz_f:
case Intrinsic::nvvm_round_f:
case Intrinsic::nvvm_round_d: {
- // Use APFloat implementation instead of native libm call, as some
- // implementations (e.g. on PPC) do not preserve the sign of negative 0.
+ // nvvm_round is lowered to PTX cvt.rni, which will round to nearest
+ // integer, choosing even integer if source is equidistant between two
+ // integers, so the semantics are closer to "rint" rather than "round".
bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
- V.roundToIntegral(APFloat::rmNearestTiesToAway);
+ V.roundToIntegral(APFloat::rmNearestTiesToEven);
return ConstantFP::get(Ty->getContext(), V);
}
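For context on the rounding-mode change above, a minimal standalone sketch (not part of the patch) of how a halfway value behaves under the old ties-to-away mode versus the new ties-to-even mode, which matches PTX cvt.rni ("rint"-like) semantics:

#include "llvm/ADT/APFloat.h"
#include <cassert>

// 2.5 is exactly halfway between 2 and 3: ties-to-even picks the even
// neighbor (2.0), while ties-to-away rounds away from zero (3.0).
static void roundingModeDemo() {
  llvm::APFloat TiesToEven(2.5), TiesToAway(2.5);
  (void)TiesToEven.roundToIntegral(llvm::APFloat::rmNearestTiesToEven);
  (void)TiesToAway.roundToIntegral(llvm::APFloat::rmNearestTiesToAway);
  assert(TiesToEven.convertToDouble() == 2.0);
  assert(TiesToAway.convertToDouble() == 3.0);
}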
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 393f264..6fc81d787 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -342,7 +342,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(
: SE.getConstantMaxBackedgeTakenCount(L);
}
const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
- L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr);
+ L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC);
if (isa<SCEVCouldNotCompute>(AccessStart) ||
isa<SCEVCouldNotCompute>(AccessEnd))
return false;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 14be385..a553533 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -23,6 +23,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
@@ -208,28 +210,46 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object.
-static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
- const SCEV *MaxBTC,
- const SCEV *EltSize,
- ScalarEvolution &SE,
- const DataLayout &DL) {
+static bool
+evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
+ const SCEV *MaxBTC, const SCEV *EltSize,
+ ScalarEvolution &SE, const DataLayout &DL,
+ DominatorTree *DT, AssumptionCache *AC) {
auto *PointerBase = SE.getPointerBase(AR->getStart());
auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
if (!StartPtr)
return false;
+ const Loop *L = AR->getLoop();
bool CheckForNonNull, CheckForFreed;
- uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes(
+ Value *StartPtrV = StartPtr->getValue();
+ uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
DL, CheckForNonNull, CheckForFreed);
- if (CheckForNonNull || CheckForFreed)
+ if (DerefBytes && (CheckForNonNull || CheckForFreed))
return false;
const SCEV *Step = AR->getStepRecurrence(SE);
+ Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
+ const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes);
+
+ // Check if we have a suitable dereferencable assumption we can use.
+ if (!StartPtrV->canBeFreed()) {
+ RetainedKnowledge DerefRK = getKnowledgeValidInContext(
+ StartPtrV, {Attribute::Dereferenceable}, *AC,
+ L->getLoopPredecessor()->getTerminator(), DT);
+ if (DerefRK) {
+ DerefBytesSCEV = SE.getUMaxExpr(
+ DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue));
+ }
+ }
+
+ if (DerefBytesSCEV->isZero())
+ return false;
+
bool IsKnownNonNegative = SE.isKnownNonNegative(Step);
if (!IsKnownNonNegative && !SE.isKnownNegative(Step))
return false;
- Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
Step = SE.getNoopOrSignExtend(Step, WiderTy);
MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy);
@@ -256,8 +276,7 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
if (!EndBytes)
return false;
- return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes,
- SE.getConstant(WiderTy, DerefBytes));
+ return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV);
}
// For negative steps check if
@@ -265,15 +284,15 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
// * StartOffset <= DerefBytes.
assert(SE.isKnownNegative(Step) && "must be known negative");
return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) &&
- SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset,
- SE.getConstant(WiderTy, DerefBytes));
+ SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, DerefBytesSCEV);
}
std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
const SCEV *MaxBTC, ScalarEvolution *SE,
DenseMap<std::pair<const SCEV *, Type *>,
- std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+ std::pair<const SCEV *, const SCEV *>> *PointerBounds,
+ DominatorTree *DT, AssumptionCache *AC) {
std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
if (PointerBounds) {
auto [Iter, Ins] = PointerBounds->insert(
@@ -308,8 +327,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
// sets ScEnd to the maximum unsigned value for the type. Note that LAA
// separately checks that accesses cannot not wrap, so unsigned max
// represents an upper bound.
- if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE,
- DL)) {
+ if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL,
+ DT, AC)) {
ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
} else {
ScEnd = SE->getAddExpr(
@@ -356,9 +375,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
bool NeedsFreeze) {
const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
const SCEV *BTC = PSE.getBackedgeTakenCount();
- const auto &[ScStart, ScEnd] =
- getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC,
- PSE.getSE(), &DC.getPointerBounds());
+ const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
+ Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(),
+ &DC.getPointerBounds(), DC.getDT(), DC.getAC());
assert(!isa<SCEVCouldNotCompute>(ScStart) &&
!isa<SCEVCouldNotCompute>(ScEnd) &&
"must be able to compute both start and end expressions");
@@ -1961,13 +1980,15 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
const SCEV *BTC = PSE.getBackedgeTakenCount();
const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
ScalarEvolution &SE = *PSE.getSE();
- const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
- InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds);
+ const auto &[SrcStart_, SrcEnd_] =
+ getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC,
+ &SE, &PointerBounds, DT, AC);
if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_))
return false;
- const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
- InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds);
+ const auto &[SinkStart_, SinkEnd_] =
+ getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC,
+ &SE, &PointerBounds, DT, AC);
if (isa<SCEVCouldNotCompute>(SinkStart_) ||
isa<SCEVCouldNotCompute>(SinkEnd_))
return false;
@@ -3002,7 +3023,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI,
- bool AllowPartial)
+ AssumptionCache *AC, bool AllowPartial)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
@@ -3012,8 +3033,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
MaxTargetVectorWidthInBits =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
- DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
- MaxTargetVectorWidthInBits);
+ DepChecker = std::make_unique<MemoryDepChecker>(
+ *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits);
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
if (canAnalyzeLoop())
CanVecMem = analyzeLoop(AA, LI, TLI, DT);
@@ -3082,7 +3103,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
// or if it was created with a different value of AllowPartial.
if (Inserted || It->second->hasAllowPartial() != AllowPartial)
It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT,
- &LI, AllowPartial);
+ &LI, AC, AllowPartial);
return *It->second;
}
@@ -3125,7 +3146,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
auto &LI = FAM.getResult<LoopAnalysis>(F);
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
+ auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+ return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
}
AnalysisKey LoopAccessAnalysis::Key;
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 9d5c39c..c6fa8f4 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3676,8 +3676,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
TopCand.SU = nullptr;
BotCand.SU = nullptr;
- TopCluster = nullptr;
- BotCluster = nullptr;
+ TopClusterID = InvalidClusterId;
+ BotClusterID = InvalidClusterId;
}
/// Initialize the per-region scheduling policy.
@@ -3988,10 +3988,14 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
@@ -4251,24 +4255,30 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
- TopCluster = DAG->getCluster(SU->ParentClusterIdx);
- LLVM_DEBUG(if (TopCluster) {
- dbgs() << " Top Cluster: ";
- for (auto *N : *TopCluster)
- dbgs() << N->NodeNum << '\t';
- dbgs() << '\n';
+ TopClusterID = SU->ParentClusterIdx;
+ LLVM_DEBUG({
+ if (TopClusterID != InvalidClusterId) {
+ ClusterInfo *TopCluster = DAG->getCluster(TopClusterID);
+ dbgs() << " Top Cluster: ";
+ for (auto *N : *TopCluster)
+ dbgs() << N->NodeNum << '\t';
+ dbgs() << '\n';
+ }
});
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysReg(SU, true);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
- BotCluster = DAG->getCluster(SU->ParentClusterIdx);
- LLVM_DEBUG(if (BotCluster) {
- dbgs() << " Bot Cluster: ";
- for (auto *N : *BotCluster)
- dbgs() << N->NodeNum << '\t';
- dbgs() << '\n';
+ BotClusterID = SU->ParentClusterIdx;
+ LLVM_DEBUG({
+ if (BotClusterID != InvalidClusterId) {
+ ClusterInfo *BotCluster = DAG->getCluster(BotClusterID);
+ dbgs() << " Bot Cluster: ";
+ for (auto *N : *BotCluster)
+ dbgs() << N->NodeNum << '\t';
+ dbgs() << '\n';
+ }
});
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
@@ -4306,8 +4316,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
if (!Bot.HazardRec) {
Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG);
}
- TopCluster = nullptr;
- BotCluster = nullptr;
+ TopClusterID = InvalidClusterId;
+ BotClusterID = InvalidClusterId;
}
void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
@@ -4373,10 +4383,14 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
// Keep clustered nodes together.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
// Avoid critical resource consumption and balance the schedule.
@@ -4575,11 +4589,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
- TopCluster = DAG->getCluster(SU->ParentClusterIdx);
+ TopClusterID = SU->ParentClusterIdx;
Top.bumpNode(SU);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
- BotCluster = DAG->getCluster(SU->ParentClusterIdx);
+ BotClusterID = SU->ParentClusterIdx;
Bot.bumpNode(SU);
}
}
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index 69b9291..2400a1f 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -178,10 +178,8 @@ void RegAllocBase::cleanupFailedVReg(Register FailedReg, MCRegister PhysReg,
for (MCRegAliasIterator Aliases(PhysReg, TRI, true); Aliases.isValid();
++Aliases) {
for (MachineOperand &MO : MRI->reg_operands(*Aliases)) {
- if (MO.readsReg()) {
+ if (MO.readsReg())
MO.setIsUndef(true);
- LIS->removeAllRegUnitsForPhysReg(MO.getReg());
- }
}
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a43020e..11e869a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -331,6 +331,11 @@ namespace {
return CombineTo(N, To, 2, AddTo);
}
+ SDValue CombineTo(SDNode *N, SmallVectorImpl<SDValue> *To,
+ bool AddTo = true) {
+ return CombineTo(N, To->data(), To->size(), AddTo);
+ }
+
void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
private:
@@ -541,6 +546,7 @@ namespace {
SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
SDValue visitBUILD_VECTOR(SDNode *N);
SDValue visitCONCAT_VECTORS(SDNode *N);
+ SDValue visitVECTOR_INTERLEAVE(SDNode *N);
SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
SDValue visitVECTOR_SHUFFLE(SDNode *N);
SDValue visitSCALAR_TO_VECTOR(SDNode *N);
@@ -2021,6 +2027,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+ case ISD::VECTOR_INTERLEAVE: return visitVECTOR_INTERLEAVE(N);
case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
@@ -4100,18 +4107,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y))
if (N1.hasOneUse() && hasUMin(VT)) {
SDValue Y;
- if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
- m_SpecificCondCode(ISD::SETULT)),
- m_Zero(), m_Deferred(Y))) ||
- sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
- m_SpecificCondCode(ISD::SETUGE)),
- m_Deferred(Y), m_Zero())) ||
- sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y),
- m_SpecificCondCode(ISD::SETULT)),
- m_Zero(), m_Deferred(Y))) ||
- sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y),
- m_SpecificCondCode(ISD::SETUGE)),
- m_Deferred(Y), m_Zero())))
+ auto MS0 = m_Specific(N0);
+ auto MVY = m_Value(Y);
+ auto MZ = m_Zero();
+ auto MCC1 = m_SpecificCondCode(ISD::SETULT);
+ auto MCC2 = m_SpecificCondCode(ISD::SETUGE);
+
+ if (sd_match(N1, m_SelectCCLike(MS0, MVY, MZ, m_Deferred(Y), MCC1)) ||
+ sd_match(N1, m_SelectCCLike(MS0, MVY, m_Deferred(Y), MZ, MCC2)) ||
+ sd_match(N1, m_VSelect(m_SetCC(MS0, MVY, MCC1), MZ, m_Deferred(Y))) ||
+ sd_match(N1, m_VSelect(m_SetCC(MS0, MVY, MCC2), m_Deferred(Y), MZ)))
+
return DAG.getNode(ISD::UMIN, DL, VT, N0,
DAG.getNode(ISD::SUB, DL, VT, N0, Y));
}
@@ -10616,6 +10622,19 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
return DAG.getVScale(DL, VT, C0 << C1);
}
+ SDValue X;
+ APInt VS0;
+
+ // fold (shl (X * vscale(VS0)), C1) -> (X * vscale(VS0 << C1))
+ if (N1C && sd_match(N0, m_Mul(m_Value(X), m_VScale(m_ConstInt(VS0))))) {
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
+ N0->getFlags().hasNoUnsignedWrap());
+
+ SDValue VScale = DAG.getVScale(DL, VT, VS0 << N1C->getAPIntValue());
+ return DAG.getNode(ISD::MUL, DL, VT, X, VScale, Flags);
+ }
+
// Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)).
APInt ShlVal;
if (N0.getOpcode() == ISD::STEP_VECTOR &&
@@ -25282,6 +25301,28 @@ static SDValue combineConcatVectorOfShuffleAndItsOperands(
return DAG.getVectorShuffle(VT, dl, ShufOps[0], ShufOps[1], Mask);
}
+static SDValue combineConcatVectorOfSplats(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalTypes,
+ bool LegalOperations) {
+ EVT VT = N->getValueType(0);
+
+ // Post-legalization we can only create wider SPLAT_VECTOR operations if both
+ // the type and operation is legal. The Hexagon target has custom
+ // legalization for SPLAT_VECTOR that splits the operation into two parts and
+ // concatenates them. Therefore, custom lowering must also be rejected in
+ // order to avoid an infinite loop.
+ if ((LegalTypes && !TLI.isTypeLegal(VT)) ||
+ (LegalOperations && !TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT)))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ if (!llvm::all_equal(N->op_values()) || Op0.getOpcode() != ISD::SPLAT_VECTOR)
+ return SDValue();
+
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, Op0.getOperand(0));
+}
+
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// If we only have one input vector, we don't need to do any concatenation.
if (N->getNumOperands() == 1)
@@ -25405,6 +25446,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return DAG.getBuildVector(VT, SDLoc(N), Opnds);
}
+ if (SDValue V =
+ combineConcatVectorOfSplats(N, DAG, TLI, LegalTypes, LegalOperations))
+ return V;
+
// Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
// FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
if (SDValue V = combineConcatVectorOfScalars(N, DAG))
@@ -25473,6 +25518,21 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitVECTOR_INTERLEAVE(SDNode *N) {
+ // Check to see if all operands are identical.
+ if (!llvm::all_equal(N->op_values()))
+ return SDValue();
+
+ // Check to see if the identical operand is a splat.
+ if (!DAG.isSplatValue(N->getOperand(0)))
+ return SDValue();
+
+ // interleave splat(X), splat(X).... --> splat(X), splat(X)....
+ SmallVector<SDValue, 4> Ops;
+ Ops.append(N->op_values().begin(), N->op_values().end());
+ return CombineTo(N, &Ops);
+}
+
// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
// if the subvector can be sourced for free.
static SDValue getSubVectorSrc(SDValue V, unsigned Index, EVT SubVT) {
@@ -28965,13 +29025,27 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
((N1C->isAllOnes() && CC == ISD::SETGT) ||
(N1C->isZero() && CC == ISD::SETLT)) &&
!TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) {
- SDValue ASR = DAG.getNode(
- ISD::SRA, DL, CmpOpVT, N0,
- DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT));
- return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT),
+ SDValue ASHR =
+ DAG.getNode(ISD::SRA, DL, CmpOpVT, N0,
+ DAG.getShiftAmountConstant(
+ CmpOpVT.getScalarSizeInBits() - 1, CmpOpVT, DL));
+ return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASHR, DL, VT),
DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT));
}
+ // Fold sign pattern select_cc setgt X, -1, 1, -1 -> or (ashr X, BW-1), 1
+ if (CC == ISD::SETGT && N1C && N2C && N3C && N1C->isAllOnes() &&
+ N2C->isOne() && N3C->isAllOnes() &&
+ !TLI.shouldAvoidTransformToShift(CmpOpVT,
+ CmpOpVT.getScalarSizeInBits() - 1)) {
+ SDValue ASHR =
+ DAG.getNode(ISD::SRA, DL, CmpOpVT, N0,
+ DAG.getShiftAmountConstant(
+ CmpOpVT.getScalarSizeInBits() - 1, CmpOpVT, DL));
+ return DAG.getNode(ISD::OR, DL, VT, DAG.getSExtOrTrunc(ASHR, DL, VT),
+ DAG.getConstant(1, DL, VT));
+ }
+
if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG))
return S;
if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG))
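The new select_cc fold above rests on a scalar identity; a minimal standalone sketch (plain C++, independent of the DAG APIs) of why or (ashr X, BW-1), 1 reproduces select_cc setgt X, -1, 1, -1 for 32-bit values:

#include <cassert>
#include <cstdint>

// For negative X the arithmetic shift yields -1 (all ones) and OR-ing in 1
// keeps -1; for non-negative X the shift yields 0 and OR-ing in 1 gives +1.
// Right-shifting a negative signed value is arithmetic on all mainstream
// compilers and is guaranteed to be since C++20.
static int32_t signViaSelect(int32_t X) { return X > -1 ? 1 : -1; }
static int32_t signViaShiftOr(int32_t X) { return (X >> 31) | 1; }

static void signFoldDemo() {
  const int32_t Vals[] = {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX};
  for (int32_t V : Vals)
    assert(signViaSelect(V) == signViaShiftOr(V));
}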
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 583a85a..a5bd97a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2217,8 +2217,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
switch (getTypeAction(InVT)) {
case TargetLowering::TypePromoteInteger: {
- // TODO: Handle big endian
- if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) {
+ // TODO: Handle big endian & vector input type.
+ if (OutVT.isVector() && !InVT.isVector() &&
+ DAG.getDataLayout().isLittleEndian()) {
EVT EltVT = OutVT.getVectorElementType();
TypeSize EltSize = EltVT.getSizeInBits();
TypeSize NInSize = NInVT.getSizeInBits();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 48d6b99..e3f2a193 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6482,8 +6482,8 @@ SDValue TargetLowering::buildSDIVPow2WithCMov(
Created.push_back(CMov.getNode());
// Divide by pow2.
- SDValue SRA =
- DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, VT));
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CMov,
+ DAG.getShiftAmountConstant(Lg2, VT, DL));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index a8559e7..6de6cc7 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -891,7 +891,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
// Align DataSize to stub alignment if we have any stubs (PaddingSize will
// have been increased above to account for this).
if (StubBufSize > 0)
- DataSize &= -(uint64_t)getStubAlignment().value();
+ DataSize &= -getStubAlignment().value();
}
LLVM_DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: "
diff --git a/llvm/lib/Frontend/Offloading/CMakeLists.txt b/llvm/lib/Frontend/Offloading/CMakeLists.txt
index 8e1ede9..9747dbd 100644
--- a/llvm/lib/Frontend/Offloading/CMakeLists.txt
+++ b/llvm/lib/Frontend/Offloading/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_component_library(LLVMFrontendOffloading
Utility.cpp
OffloadWrapper.cpp
+ PropertySet.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend
diff --git a/llvm/lib/Frontend/Offloading/PropertySet.cpp b/llvm/lib/Frontend/Offloading/PropertySet.cpp
new file mode 100644
index 0000000..a70290d
--- /dev/null
+++ b/llvm/lib/Frontend/Offloading/PropertySet.cpp
@@ -0,0 +1,102 @@
+///===- llvm/Frontend/Offloading/PropertySet.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/Offloading/PropertySet.h"
+#include "llvm/Support/Base64.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBufferRef.h"
+
+using namespace llvm;
+using namespace llvm::offloading;
+
+void llvm::offloading::writePropertiesToJSON(
+ const PropertySetRegistry &PSRegistry, raw_ostream &Out) {
+ json::OStream J(Out);
+ J.object([&] {
+ for (const auto &[CategoryName, PropSet] : PSRegistry) {
+ auto PropSetCapture = PropSet;
+ J.attributeObject(CategoryName, [&] {
+ for (const auto &[PropName, PropVal] : PropSetCapture) {
+ switch (PropVal.index()) {
+ case 0:
+ J.attribute(PropName, std::get<uint32_t>(PropVal));
+ break;
+ case 1:
+ J.attribute(PropName, encodeBase64(std::get<ByteArray>(PropVal)));
+ break;
+ default:
+ llvm_unreachable("unsupported property type");
+ }
+ }
+ });
+ }
+ });
+}
+
+// note: createStringError has an overload that takes a format string,
+// but it uses llvm::format instead of llvm::formatv, which does
+// not work with json::Value. This is a helper function to use
+// llvm::formatv with createStringError.
+template <typename... Ts> auto createStringErrorV(Ts &&...Args) {
+ return createStringError(formatv(std::forward<Ts>(Args)...));
+}
+
+Expected<PropertyValue>
+readPropertyValueFromJSON(const json::Value &PropValueVal) {
+ if (std::optional<uint64_t> Val = PropValueVal.getAsUINT64())
+ return PropertyValue(static_cast<uint32_t>(*Val));
+
+ if (std::optional<StringRef> Val = PropValueVal.getAsString()) {
+ std::vector<char> Decoded;
+ if (Error E = decodeBase64(*Val, Decoded))
+ return createStringErrorV("unable to base64 decode the string {0}: {1}",
+ Val, toString(std::move(E)));
+ return PropertyValue(ByteArray(Decoded.begin(), Decoded.end()));
+ }
+
+ return createStringErrorV("expected a uint64 or a string, got {0}",
+ PropValueVal);
+}
+
+Expected<PropertySetRegistry>
+llvm::offloading::readPropertiesFromJSON(MemoryBufferRef Buf) {
+ PropertySetRegistry Res;
+ Expected<json::Value> V = json::parse(Buf.getBuffer());
+ if (Error E = V.takeError())
+ return E;
+
+ const json::Object *O = V->getAsObject();
+ if (!O)
+ return createStringErrorV(
+ "error while deserializing property set registry: "
+ "expected JSON object, got {0}",
+ *V);
+
+ for (const auto &[CategoryName, Value] : *O) {
+ const json::Object *PropSetVal = Value.getAsObject();
+ if (!PropSetVal)
+ return createStringErrorV("error while deserializing property set {0}: "
+ "expected JSON array, got {1}",
+ CategoryName.str(), Value);
+
+ PropertySet &PropSet = Res[CategoryName.str()];
+ for (const auto &[PropName, PropValueVal] : *PropSetVal) {
+ Expected<PropertyValue> Prop = readPropertyValueFromJSON(PropValueVal);
+ if (Error E = Prop.takeError())
+ return createStringErrorV(
+ "error while deserializing property {0} in property set {1}: {2}",
+ PropName.str(), CategoryName.str(), toString(std::move(E)));
+
+ auto [It, Inserted] =
+ PropSet.try_emplace(PropName.str(), std::move(*Prop));
+ assert(Inserted && "Property already exists in PropertySet");
+ (void)Inserted;
+ }
+ }
+ return Res;
+}
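A brief usage sketch of the new JSON round trip (assuming the accompanying PropertySet.h header from this change); the category and property names are hypothetical and error handling is reduced to consumeError:

#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/PropertySet.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/raw_ostream.h"

static void propertySetRoundTrip() {
  // uint32 properties serialize as plain JSON numbers; byte arrays serialize
  // as base64 strings ("AAEC" decodes to the bytes 0x00 0x01 0x02).
  llvm::StringRef Json = R"({"MyCategory":{"Blob":"AAEC","Count":3}})";
  llvm::Expected<llvm::offloading::PropertySetRegistry> Reg =
      llvm::offloading::readPropertiesFromJSON(
          llvm::MemoryBufferRef(Json, "props.json"));
  if (!Reg) {
    llvm::consumeError(Reg.takeError());
    return;
  }
  llvm::offloading::writePropertiesToJSON(*Reg, llvm::outs());
}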
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f16963d..f1d4549 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -1012,7 +1012,7 @@ DIDerivedType *DIDerivedType::getImpl(
std::optional<DIDerivedType::PtrAuthData>
DIDerivedType::getPtrAuthData() const {
return getTag() == dwarf::DW_TAG_LLVM_ptrauth_type
- ? std::optional<PtrAuthData>(PtrAuthData(SubclassData32))
+ ? std::make_optional<PtrAuthData>(SubclassData32)
: std::nullopt;
}
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 8500fd1..5c8e904 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -59,7 +59,8 @@ STATISTIC(EmittedFillFragments,
"Number of emitted assembler fragments - fill");
STATISTIC(EmittedNopsFragments, "Number of emitted assembler fragments - nops");
STATISTIC(EmittedOrgFragments, "Number of emitted assembler fragments - org");
-STATISTIC(evaluateFixup, "Number of evaluated fixups");
+STATISTIC(Fixups, "Number of fixups");
+STATISTIC(FixupEvalForRelax, "Number of fixup evaluations for relaxation");
STATISTIC(ObjectBytes, "Number of emitted object file bytes");
STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
@@ -140,9 +141,9 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup,
MCValue &Target, uint64_t &Value,
- bool RecordReloc,
- MutableArrayRef<char> Contents) const {
- ++stats::evaluateFixup;
+ bool RecordReloc, uint8_t *Data) const {
+ if (RecordReloc)
+ ++stats::Fixups;
// FIXME: This code has some duplication with recordRelocation. We should
// probably merge the two into a single callback that tries to evaluate a
@@ -185,7 +186,7 @@ bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup,
if (IsResolved && mc::isRelocRelocation(Fixup.getKind()))
IsResolved = false;
- getBackend().applyFixup(F, Fixup, Target, Contents, Value, IsResolved);
+ getBackend().applyFixup(F, Fixup, Target, Data, Value, IsResolved);
return true;
}
@@ -703,21 +704,25 @@ void MCAssembler::layout() {
for (MCFixup &Fixup : F.getFixups()) {
uint64_t FixedValue;
MCValue Target;
+ assert(mc::isRelocRelocation(Fixup.getKind()) ||
+ Fixup.getOffset() <= F.getFixedSize());
+ auto *Data =
+ reinterpret_cast<uint8_t *>(Contents.data() + Fixup.getOffset());
evaluateFixup(F, Fixup, Target, FixedValue,
- /*RecordReloc=*/true, Contents);
+ /*RecordReloc=*/true, Data);
}
- if (F.getVarFixups().size()) {
- // In the variable part, fixup offsets are relative to the fixed part's
- // start. Extend the variable contents to the left to account for the
- // fixed part size.
- Contents = MutableArrayRef(F.getParent()->ContentStorage)
- .slice(F.VarContentStart - Contents.size(), F.getSize());
- for (MCFixup &Fixup : F.getVarFixups()) {
- uint64_t FixedValue;
- MCValue Target;
- evaluateFixup(F, Fixup, Target, FixedValue,
- /*RecordReloc=*/true, Contents);
- }
+ // In the variable part, fixup offsets are relative to the fixed part's
+ // start.
+ for (MCFixup &Fixup : F.getVarFixups()) {
+ uint64_t FixedValue;
+ MCValue Target;
+ assert(mc::isRelocRelocation(Fixup.getKind()) ||
+ (Fixup.getOffset() >= F.getFixedSize() &&
+ Fixup.getOffset() <= F.getSize()));
+ auto *Data = reinterpret_cast<uint8_t *>(
+ F.getVarContents().data() + (Fixup.getOffset() - F.getFixedSize()));
+ evaluateFixup(F, Fixup, Target, FixedValue,
+ /*RecordReloc=*/true, Data);
}
}
}
@@ -735,7 +740,7 @@ void MCAssembler::Finish() {
bool MCAssembler::fixupNeedsRelaxation(const MCFragment &F,
const MCFixup &Fixup) const {
- assert(getBackendPtr() && "Expected assembler backend");
+ ++stats::FixupEvalForRelax;
MCValue Target;
uint64_t Value;
bool Resolved = evaluateFixup(F, const_cast<MCFixup &>(Fixup), Target, Value,
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 7d528a5..3a5f01c 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -695,5 +695,7 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
}
Frag.setVarContents(Contents);
+ assert(Fixups.size() < 256 && "Store fixups outside of MCFragment's VarFixup "
+ "storage if the number ever exceeds 256");
Frag.setVarFixups(Fixups);
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index e277143..200e29a 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -46,27 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+constexpr size_t FragBlockSize = 16384;
+// Ensure the new fragment can at least store a few bytes.
+constexpr size_t NewFragHeadroom = 8;
+
+static_assert(NewFragHeadroom >= alignof(MCFragment));
+static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom);
+
+MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) {
+ auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom);
+ FragSpace = Size - sizeof(MCFragment);
+ auto Block = std::unique_ptr<uint8_t[]>(new uint8_t[Size]);
+ auto *F = reinterpret_cast<MCFragment *>(Block.get());
+ FragStorage.push_back(std::move(Block));
+ return F;
+}
+
void MCObjectStreamer::newFragment() {
- addFragment(getContext().allocFragment<MCFragment>());
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ addFragment(F);
+}
+
+void MCObjectStreamer::ensureHeadroom(size_t Headroom) {
+ if (Headroom <= FragSpace)
+ return;
+ auto *F = allocFragSpace(Headroom);
+ new (F) MCFragment();
+ addFragment(F);
}
-void MCObjectStreamer::insert(MCFragment *F) {
- assert(F->getKind() != MCFragment::FT_Data &&
+void MCObjectStreamer::insert(MCFragment *Frag) {
+ assert(Frag->getKind() != MCFragment::FT_Data &&
"F should have a variable-size tail");
+ // Frag is not connected to FragSpace. Before modifying CurFrag with
+ // addFragment(Frag), allocate an empty fragment to maintain FragSpace
+ // connectivity, potentially reusing CurFrag's associated space.
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+
+ addFragment(Frag);
addFragment(F);
- newFragment();
}
void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
- CurFrag->appendContents(Contents);
+ ensureHeadroom(Contents.size());
+ assert(FragSpace >= Contents.size());
+ llvm::copy(Contents, getCurFragEnd());
+ CurFrag->FixedSize += Contents.size();
+ FragSpace -= Contents.size();
}
-void MCObjectStreamer::appendContents(size_t Num, char Elt) {
- CurFrag->appendContents(Num, Elt);
+void MCObjectStreamer::appendContents(size_t Num, uint8_t Elt) {
+ ensureHeadroom(Num);
+ MutableArrayRef<uint8_t> Data(getCurFragEnd(), Num);
+ llvm::fill(Data, Elt);
+ CurFrag->FixedSize += Num;
+ FragSpace -= Num;
}
void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
- CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+ CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind));
}
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
@@ -115,6 +171,8 @@ void MCObjectStreamer::reset() {
}
EmitEHFrame = true;
EmitDebugFrame = false;
+ FragStorage.clear();
+ FragSpace = 0;
MCStreamer::reset();
}
@@ -143,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -158,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
emitIntValue(AbsValue, Size);
return;
}
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- MCFixup::getDataKindForSize(Size)));
- DF->appendContents(Size, 0);
+ ensureHeadroom(Size);
+ addFixup(Value, MCFixup::getDataKindForSize(Size));
+ appendContents(Size, 0);
}
MCSymbol *MCObjectStreamer::emitCFILabel() {
@@ -194,7 +251,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// section.
MCFragment *F = CurFrag;
Symbol->setFragment(F);
- Symbol->setOffset(F->getContents().size());
+ Symbol->setOffset(F->getFixedSize());
emitPendingAssignments(Symbol);
}
@@ -260,6 +317,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
F0 = CurFrag;
}
+ // To maintain connectivity between CurFrag and FragSpace when CurFrag is
+ // modified, allocate an empty fragment and append it to the fragment list.
+ // (Subsections[I].second.Tail is not connected to FragSpace.)
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ F->setParent(Section);
+
auto &Subsections = Section->Subsections;
size_t I = 0, E = Subsections.size();
while (I != E && Subsections[I].first < Subsection)
@@ -267,13 +339,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// If the subsection number is not in the sorted Subsections list, create a
// new fragment list.
if (I == E || Subsections[I].first != Subsection) {
- auto *F = getContext().allocFragment<MCFragment>();
- F->setParent(Section);
Subsections.insert(Subsections.begin() + I,
{Subsection, MCSection::FragList{F, F}});
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = F;
+ } else {
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = Subsections[I].second.Tail;
+ // Ensure CurFrag is associated with FragSpace.
+ addFragment(F);
}
- Section->CurFragList = &Subsections[I].second;
- CurFrag = Section->CurFragList->Tail;
// Define the section symbol at subsection 0's initial fragment if required.
if (!NewSec)
@@ -344,11 +419,15 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
- size_t CodeOffset = F->getContents().size();
+ size_t CodeOffset = getCurFragSize();
+ SmallString<16> Content;
SmallVector<MCFixup, 1> Fixups;
- getAssembler().getEmitter().encodeInstruction(
- Inst, F->getContentsForAppending(), Fixups, STI);
- F->doneAppending();
+ getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI);
+ appendContents(Content);
+ if (CurFrag != F) {
+ F = CurFrag;
+ CodeOffset = 0;
+ }
F->setHasInstructions(STI);
if (Fixups.empty())
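The streamer changes above keep the current fragment adjacent to its spare block space so appended bytes and fixups land in the same fragment. A self-contained sketch of the underlying allocate-with-headroom pattern, using illustrative names rather than the MC API:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

class BumpBuffer {
  static constexpr size_t BlockSize = 16384;
  std::vector<std::unique_ptr<uint8_t[]>> Blocks;
  uint8_t *Cur = nullptr; // next free byte in the current block
  size_t Space = 0;       // bytes left in the current block

public:
  // Guarantee at least N contiguous bytes up front, so a logical run of
  // appends is never split across two blocks.
  void ensureHeadroom(size_t N) {
    if (N <= Space)
      return;
    size_t Size = std::max(BlockSize, N);
    Blocks.push_back(std::make_unique<uint8_t[]>(Size));
    Cur = Blocks.back().get();
    Space = Size;
  }

  void append(const uint8_t *Data, size_t N) {
    ensureHeadroom(N);
    Cur = std::copy(Data, Data + N, Cur);
    Space -= N;
  }
};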
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 4f28267..27ca131 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -83,12 +83,14 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) {
}
void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) {
+ assert(Fixups.size() < 256 &&
+ "variable-size tail cannot have more than 256 fixups");
auto &S = getParent()->FixupStorage;
- if (VarFixupStart + Fixups.size() > VarFixupEnd) {
+ if (Fixups.size() > VarFixupSize) {
VarFixupStart = S.size();
S.resize_for_overwrite(S.size() + Fixups.size());
}
- VarFixupEnd = VarFixupStart + Fixups.size();
+ VarFixupSize = Fixups.size();
// Source fixup offsets are relative to the variable part's start. Add the
// fixed part size to make them relative to the fixed part's start.
std::transform(Fixups.begin(), Fixups.end(), S.begin() + VarFixupStart,
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 72a8dd7..a87648a 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
+ // Ensure the fixups and appended content apply to the same fragment.
+ OS->ensureHeadroom(info->EpilogMap.size() * 2);
+
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 1ffe25c..8be5054 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -280,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
+ ensureHeadroom(2);
addFixup(SRE, FK_SecRel_2);
appendContents(2, 0);
}
@@ -293,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -308,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -318,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -328,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp
index 0ca6217..b00419b 100644
--- a/llvm/lib/Remarks/RemarkLinker.cpp
+++ b/llvm/lib/Remarks/RemarkLinker.cpp
@@ -70,8 +70,8 @@ Error RemarkLinker::link(StringRef Buffer, Format RemarkFormat) {
Expected<std::unique_ptr<RemarkParser>> MaybeParser =
createRemarkParserFromMeta(
RemarkFormat, Buffer,
- PrependPath ? std::optional<StringRef>(StringRef(*PrependPath))
- : std::optional<StringRef>());
+ PrependPath ? std::make_optional<StringRef>(*PrependPath)
+ : std::nullopt);
if (!MaybeParser)
return MaybeParser.takeError();
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 5e0b29f..46084c5 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -900,6 +900,30 @@ writeSignedDecimal (char *dst, int value)
return dst;
}
+// Compute the ULP of the input using a definition from:
+// Jean-Michel Muller. On the definition of ulp(x). [Research Report] RR-5504,
+// LIP RR-2005-09, INRIA, LIP. 2005, pp.16. inria-00070503
+static APFloat harrisonUlp(const APFloat &X) {
+ const fltSemantics &Sem = X.getSemantics();
+ switch (X.getCategory()) {
+ case APFloat::fcNaN:
+ return APFloat::getQNaN(Sem);
+ case APFloat::fcInfinity:
+ return APFloat::getInf(Sem);
+ case APFloat::fcZero:
+ return APFloat::getSmallest(Sem);
+ case APFloat::fcNormal:
+ break;
+ }
+ if (X.isDenormal() || X.isSmallestNormalized())
+ return APFloat::getSmallest(Sem);
+ int Exp = ilogb(X);
+ if (X.getExactLog2() != INT_MIN)
+ Exp -= 1;
+ return scalbn(APFloat::getOne(Sem), Exp - (Sem.precision - 1),
+ APFloat::rmNearestTiesToEven);
+}
+
namespace detail {
/* Constructors. */
void IEEEFloat::initialize(const fltSemantics *ourSemantics) {
@@ -5306,12 +5330,110 @@ Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S,
return Ret;
}
+// The double-double lattice of values corresponds to numbers which obey:
+// - abs(lo) <= 1/2 * ulp(hi)
+// - roundTiesToEven(hi + lo) == hi
+//
+// nextUp must choose the smallest output > input that follows these rules.
+// nextDown must choose the largest output < input that follows these rules.
APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
- auto Ret = Tmp.next(nextDown);
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
- return Ret;
+ // nextDown(x) = -nextUp(-x)
+ if (nextDown) {
+ changeSign();
+ APFloat::opStatus Result = next(/*nextDown=*/false);
+ changeSign();
+ return Result;
+ }
+ switch (getCategory()) {
+ case fcInfinity:
+ // nextUp(+inf) = +inf
+ // nextUp(-inf) = -getLargest()
+ if (isNegative())
+ makeLargest(true);
+ return opOK;
+
+ case fcNaN:
+ // IEEE-754R 2008 6.2 Par 2: nextUp(sNaN) = qNaN. Set Invalid flag.
+ // IEEE-754R 2008 6.2: nextUp(qNaN) = qNaN. Must be identity so we do not
+ // change the payload.
+ if (getFirst().isSignaling()) {
+ // For consistency, propagate the sign of the sNaN to the qNaN.
+ makeNaN(false, isNegative(), nullptr);
+ return opInvalidOp;
+ }
+ return opOK;
+
+ case fcZero:
+ // nextUp(pm 0) = +getSmallest()
+ makeSmallest(false);
+ return opOK;
+
+ case fcNormal:
+ break;
+ }
+
+ const APFloat &HiOld = getFirst();
+ const APFloat &LoOld = getSecond();
+
+ APFloat NextLo = LoOld;
+ NextLo.next(/*nextDown=*/false);
+
+ // We want to admit values where:
+ // 1. abs(Lo) <= ulp(Hi)/2
+ // 2. Hi == RTNE(Hi + lo)
+ auto InLattice = [](const APFloat &Hi, const APFloat &Lo) {
+ return Hi + Lo == Hi;
+ };
+
+  // Check if (HiOld, nextUp(LoOld)) is in the lattice.
+ if (InLattice(HiOld, NextLo)) {
+ // Yes, the result is (HiOld, nextUp(LoOld)).
+ Floats[1] = std::move(NextLo);
+
+ // TODO: Because we currently rely on semPPCDoubleDoubleLegacy, our maximum
+ // value is defined to have exactly 106 bits of precision. This limitation
+ // results in semPPCDoubleDouble being unable to reach its maximum canonical
+ // value.
+ DoubleAPFloat Largest{*Semantics, uninitialized};
+ Largest.makeLargest(/*Neg=*/false);
+ if (compare(Largest) == cmpGreaterThan)
+ makeInf(/*Neg=*/false);
+
+ return opOK;
+ }
+
+ // Now we need to handle the cases where (HiOld, nextUp(LoOld)) is not the
+ // correct result. We know the new hi component will be nextUp(HiOld) but our
+ // lattice rules make it a little ambiguous what the correct NextLo must be.
+ APFloat NextHi = HiOld;
+ NextHi.next(/*nextDown=*/false);
+
+ // nextUp(getLargest()) == INFINITY
+ if (NextHi.isInfinity()) {
+ makeInf(/*Neg=*/false);
+ return opOK;
+ }
+
+ // IEEE 754-2019 5.3.1:
+ // "If x is the negative number of least magnitude in x's format, nextUp(x) is
+ // -0."
+ if (NextHi.isZero()) {
+ makeZero(/*Neg=*/true);
+ return opOK;
+ }
+
+ // abs(NextLo) must be <= ulp(NextHi)/2. We want NextLo to be as close to
+ // negative infinity as possible.
+ NextLo = neg(scalbn(harrisonUlp(NextHi), -1, rmTowardZero));
+ if (!InLattice(NextHi, NextLo))
+ // RTNE may mean that Lo must be < ulp(NextHi) / 2 so we bump NextLo.
+ NextLo.next(/*nextDown=*/false);
+
+ Floats[0] = std::move(NextHi);
+ Floats[1] = std::move(NextLo);
+
+ return opOK;
}
APFloat::opStatus
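For intuition on the harrisonUlp helper added above, a standalone sketch (not part of the patch) that recomputes the same quantity for positive normal IEEE doubles (precision 53) with <cmath>, omitting the zero/denormal/NaN/infinity cases:

#include <cassert>
#include <cmath>

static double harrisonUlpDouble(double X) {
  int Exp = std::ilogb(X);
  // An exact power of two sits on a binade boundary, so its two straddling
  // neighbors are one exponent closer together.
  if (std::ldexp(1.0, Exp) == X)
    Exp -= 1;
  return std::ldexp(1.0, Exp - 52);
}

static void ulpDemo() {
  assert(harrisonUlpDouble(1.5) == 0x1p-52); // ordinary value
  assert(harrisonUlpDouble(1.0) == 0x1p-53); // exact power of two
}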
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 509cbb0..e8d3161 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -813,8 +813,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
}
}
- if (!F.hasFnAttribute(Attribute::HybridPatchable) || F.isDeclaration() ||
- F.hasLocalLinkage() ||
+ if (!F.hasFnAttribute(Attribute::HybridPatchable) ||
+ F.isDeclarationForLinker() || F.hasLocalLinkage() ||
F.getName().ends_with(HybridPatchableTargetSuffix))
continue;
@@ -857,7 +857,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
SetVector<GlobalValue *> DirectCalledFns;
for (Function &F : Mod)
- if (!F.isDeclaration() &&
+ if (!F.isDeclarationForLinker() &&
F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native &&
F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64)
processFunction(F, DirectCalledFns, FnsMap);
@@ -869,7 +869,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
};
SmallVector<ThunkInfo> ThunkMapping;
for (Function &F : Mod) {
- if (!F.isDeclaration() && (!F.hasLocalLinkage() || F.hasAddressTaken()) &&
+ if (!F.isDeclarationForLinker() &&
+ (!F.hasLocalLinkage() || F.hasAddressTaken()) &&
F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native &&
F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64) {
if (!F.hasComdat())
@@ -959,7 +960,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
// unprototyped functions in C)
if (Function *F = CB->getCalledFunction()) {
if (!LowerDirectToIndirect || F->hasLocalLinkage() ||
- F->isIntrinsic() || !F->isDeclaration())
+ F->isIntrinsic() || !F->isDeclarationForLinker())
continue;
DirectCalledFns.insert(F);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4f6e3dd..2b6ea86 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -162,10 +162,10 @@ static cl::opt<bool> UseFEATCPACodegen(
cl::init(false));
/// Value type used for condition codes.
-static const MVT MVT_CC = MVT::i32;
+constexpr MVT CondCodeVT = MVT::i32;
/// Value type used for NZCV flags.
-static constexpr MVT FlagsVT = MVT::i32;
+constexpr MVT FlagsVT = MVT::i32;
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
@@ -3472,6 +3472,12 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
}
}
+/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
+static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC) {
+ // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
+ return DAG.getConstant(CC, SDLoc(), CondCodeVT);
+}
+
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
@@ -3678,7 +3684,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
- SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+ SDValue Condition = getCondCode(DAG, Predicate);
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
@@ -4075,7 +4081,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
- AArch64cc = DAG.getConstant(AArch64CC, DL, MVT_CC);
+ AArch64cc = getCondCode(DAG, AArch64CC);
return Cmp;
}
@@ -4195,7 +4201,7 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
AArch64CC::CondCode CC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
- SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
@@ -4274,8 +4280,8 @@ static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
SDLoc DL(Glue);
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
- unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
- SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
+ AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
+ SDValue CC = getCondCode(DAG, Cond);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
}
@@ -4285,7 +4291,7 @@ static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
SDLoc DL(Glue);
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
- SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
+ SDValue CC = getCondCode(DAG, AArch64CC::VS);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
}
@@ -4334,7 +4340,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// We use an inverted condition, because the conditional select is inverted
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
- SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
Overflow =
DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
@@ -7124,8 +7130,7 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
- DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
- Cmp.getValue(1));
+ getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
}
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
@@ -7136,7 +7141,7 @@ static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
AArch64CC::CondCode CC;
if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
SDLoc DL(Op);
- SDValue CCVal = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, CC);
return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
@@ -10575,7 +10580,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
- SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, OFCC);
return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
Overflow);
@@ -10648,7 +10653,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
AArch64CC::isValidCBCond(changeIntCCToAArch64CC(CC)) &&
ProduceNonFlagSettingCondBr) {
SDValue Cond =
- DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, MVT::i32);
+ DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, CondCodeVT);
return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
Dest);
}
@@ -10667,11 +10672,11 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32);
+ SDValue CC1Val = getCondCode(DAG, CC1);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
if (CC2 != AArch64CC::AL) {
- SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32);
+ SDValue CC2Val = getCondCode(DAG, CC2);
return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
Cmp);
}
@@ -11160,7 +11165,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
CC2);
- SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32);
+ SDValue CC1Val = getCondCode(DAG, CC1);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
@@ -11173,11 +11178,11 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// of the first as the RHS. We're effectively OR'ing the two CC's together.
// FIXME: It would be nice if we could match the two CSELs to two CSINCs.
- SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32);
+ SDValue CC1Val = getCondCode(DAG, CC1);
SDValue CS1 =
DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
- SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32);
+ SDValue CC2Val = getCondCode(DAG, CC2);
Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
}
return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
@@ -11205,8 +11210,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
- SDValue CCVal =
- DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
// Inputs are swapped because the condition is inverted. This will allow
// matching with a single CSINC instruction.
return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
@@ -11360,18 +11364,6 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
- // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
- // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
- // supported types.
- if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
- CTVal->isOne() && CFVal->isAllOnes() &&
- LHS.getValueType() == TVal.getValueType()) {
- EVT VT = LHS.getValueType();
- SDValue Shift =
- DAG.getNode(ISD::SRA, DL, VT, LHS,
- DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
- return DAG.getNode(ISD::OR, DL, VT, Shift, DAG.getConstant(1, DL, VT));
- }
// Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
// (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
@@ -11577,13 +11569,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
}
// Emit first, and possibly only, CSEL.
- SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32);
+ SDValue CC1Val = getCondCode(DAG, CC1);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
// If we need a second CSEL, emit it, using the output of the first as the
// RHS. We're effectively OR'ing the two CC's together.
if (CC2 != AArch64CC::AL) {
- SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32);
+ SDValue CC2Val = getCondCode(DAG, CC2);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
}
@@ -11685,7 +11677,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
- SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, OFCC);
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
@@ -12525,10 +12517,10 @@ static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
/// WZR, invert(<cond>)'.
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
SelectionDAG &DAG) {
- return DAG.getNode(
- AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
+ return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
}
// Lower @cc flag output via getSETCC.
@@ -18699,7 +18691,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
Created.push_back(Cmp.getNode());
Created.push_back(And.getNode());
} else {
- SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
+ SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
SDVTList VTs = DAG.getVTList(VT, FlagsVT);
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
@@ -19571,11 +19563,11 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::AND) {
AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
- Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
+ Condition = getCondCode(DAG, InvCC0);
NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
} else {
AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
- Condition = DAG.getConstant(CC0, DL, MVT_CC);
+ Condition = getCondCode(DAG, CC0);
NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
}
@@ -19596,8 +19588,7 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
}
return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
- CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
- CCmp);
+ CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
}
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
@@ -19802,7 +19793,7 @@ static SDValue performANDSETCCCombine(SDNode *N,
SDLoc DL(N);
return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
DAG.getConstant(0, DL, VT),
- DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
+ getCondCode(DAG, InvertedCC), Cmp);
}
}
return SDValue();
@@ -20793,7 +20784,7 @@ static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
"Unexpected constant value");
SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
- SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
+ SDValue CCVal = getCondCode(DAG, AArch64CC);
SDValue Cmp = LHS.getOperand(3);
return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
@@ -20979,7 +20970,7 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
// (CINC x cc cond) <=> (CSINC x x !cc cond)
- SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
+ SDValue CC = getCondCode(DAG, AArch64CC::LO);
return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}
@@ -22052,7 +22043,7 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
// Convert CC to integer based on requested condition.
// NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
- SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
+ SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
return DAG.getZExtOrTrunc(Res, DL, VT);
}
@@ -25093,10 +25084,9 @@ static SDValue performBRCONDCombine(SDNode *N,
auto CSelCC = getCSETCondCode(CSel);
if (CSelCC) {
SDLoc DL(N);
- return DAG.getNode(
- N->getOpcode(), DL, N->getVTList(), Chain, Dest,
- DAG.getConstant(getInvertedCondCode(*CSelCC), DL, MVT::i32),
- CSel.getOperand(3));
+ return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
+ getCondCode(DAG, getInvertedCondCode(*CSelCC)),
+ CSel.getOperand(3));
}
}
@@ -25237,7 +25227,7 @@ static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
SDLoc DL(Op);
EVT VT = Op->getValueType(0);
- SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue CCValue = getCondCode(DAG, CC);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
}
@@ -25314,8 +25304,7 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
SDValue TValReassoc = Reassociate(TReassocOp, 0);
SDValue FValReassoc = Reassociate(FReassocOp, 1);
return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
- DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC),
- NewCmp.getValue(1));
+ getCondCode(DAG, NewCC), NewCmp.getValue(1));
};
auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
@@ -25456,8 +25445,7 @@ static SDValue performCSELCombine(SDNode *N,
SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
Cond.getOperand(1), Cond.getOperand(0));
return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
- N->getOperand(1),
- DAG.getConstant(NewCond, DL, MVT::i32),
+ N->getOperand(1), getCondCode(DAG, NewCond),
Sub.getValue(1));
}
}
@@ -25557,10 +25545,9 @@ static SDValue performSETCCCombine(SDNode *N,
auto NewCond = getInvertedCondCode(OldCond);
// csel 0, 1, !cond, X
- SDValue CSEL =
- DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
- LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
- LHS.getOperand(3));
+ SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
+ LHS.getOperand(0), LHS.getOperand(1),
+ getCondCode(DAG, NewCond), LHS.getOperand(3));
return DAG.getZExtOrTrunc(CSEL, DL, VT);
}
@@ -25630,8 +25617,7 @@ static SDValue performFlagSettingCombine(SDNode *N,
// If the flag result isn't used, convert back to a generic opcode.
if (!N->hasAnyUseOfValue(1)) {
SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
- return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
- DL);
+ return DCI.CombineTo(N, Res, SDValue(N, 1));
}
// Combine identical generic nodes into this node, re-using the result.
@@ -27013,10 +26999,10 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
SDValue A = DAG.getNode(
AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
- SDValue B = DAG.getNode(
- AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
+ SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ getCondCode(DAG, AArch64CC::NE), A.getValue(1));
return DAG.getMergeValues(
{A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 251fd44..ac31236 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -448,8 +448,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisVT<1, FlagsVT>,
SDTCisVT<4, FlagsVT>]>;
+// Value type used for condition codes.
+// Should be kept in sync with its C++ counterpart.
+defvar CondCodeVT = i32;
+
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
- [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, CondCodeVT>,
SDTCisVT<2, FlagsVT>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
@@ -458,22 +463,22 @@ def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
- SDTCisInt<3>,
+ SDTCisVT<3, CondCodeVT>,
SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
[SDTCisVT<0, FlagsVT>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
- SDTCisInt<4>,
- SDTCisVT<5, i32>]>;
+ SDTCisVT<4, CondCodeVT>,
+ SDTCisVT<5, FlagsVT>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
[SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
- SDTCisInt<4>,
- SDTCisVT<5, i32>]>;
+ SDTCisVT<4, CondCodeVT>,
+ SDTCisVT<5, FlagsVT>]>;
def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
@@ -546,7 +551,8 @@ def SDT_AArch64TBL : SDTypeProfile<1, 2, [
]>;
def SDT_AArch64cb : SDTypeProfile<0, 4,
- [SDTCisVT<0, i32>, SDTCisInt<1>, SDTCisInt<2>,
+ [SDTCisVT<0, CondCodeVT>,
+ SDTCisInt<1>, SDTCisInt<2>,
SDTCisVT<3, OtherVT>]>;
// non-extending masked load fragment.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 18ca22f..e1adc0b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -270,6 +270,13 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
SMECallAttrs CallAttrs(*Caller, *Callee);
+ // Never inline a function explicitly marked as streaming into a
+ // non-streaming function; assume it was marked as streaming for a reason.
+ if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
+ CallAttrs.callee().hasStreamingInterfaceOrBody())
+ return false;
+
// When inlining, we should consider the body of the function, not the
// interface.
if (CallAttrs.callee().hasStreamingBody()) {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index b9d3e1b..7a2b679 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -79,8 +79,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value) const override;
@@ -421,9 +420,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) {
}
void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (shouldForceRelocation(Fixup))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -460,8 +458,8 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
@@ -471,15 +469,16 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (FulleSizeInBytes == 0) {
// Handle as little-endian
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
} else {
// Handle as big-endian
- assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!");
+ assert(Fixup.getOffset() + FulleSizeInBytes <= F.getSize() &&
+ "Invalid fixup size!");
assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = FulleSizeInBytes - 1 - i;
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
@@ -492,9 +491,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
- Data[Offset + 3] &= ~(1 << 6);
+ Data[3] &= ~(1 << 6);
else
- Data[Offset + 3] |= (1 << 6);
+ Data[3] |= (1 << 6);
}
}
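
A minimal sketch of the little-endian patching loop after the signature change, assuming only what the hunk shows: Data now points at Fixup.getOffset() within the fragment, so byte indices start at 0. It is not the backend code itself.

#include <cstdint>

// OR each byte of Value into the fragment, least-significant byte first,
// starting at the fixup offset that Data already points to.
static void patchLittleEndian(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
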
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8a0c4ac..18f3c47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
"Has v_tanh_f32/f16 instructions"
>;
+def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts",
+ "HasTensorCvtLutInsts",
+ "true",
+ "Has v_perm_pk16* instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTanhInsts,
+ FeatureTensorCvtLutInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
@@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
AssemblerPredicate<(all_of FeatureTanhInsts)>;
+def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">,
+ AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 31c4f62..d059480 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
+
+ setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
+ setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
+ setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
+
+ setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
+ setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
+ setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
+
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c8e45d4..d11e5a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3204,6 +3204,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 5);
return;
}
+ case Intrinsic::amdgcn_permlane_bcast:
+ case Intrinsic::amdgcn_permlane_up:
+ case Intrinsic::amdgcn_permlane_down:
+ case Intrinsic::amdgcn_permlane_xor:
+ // Doing a waterfall loop over these wouldn't make any sense.
+ constrainOpWithReadfirstlane(B, MI, 3);
+ constrainOpWithReadfirstlane(B, MI, 4);
+ return;
+ case Intrinsic::amdgcn_permlane_idx_gen: {
+ constrainOpWithReadfirstlane(B, MI, 3);
+ return;
+ }
case Intrinsic::amdgcn_sbfe:
applyMappingBFE(B, OpdMapper, true);
return;
@@ -4591,6 +4603,42 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
@@ -4765,6 +4813,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
+ case Intrinsic::amdgcn_perm_pk16_b4_u4:
+ case Intrinsic::amdgcn_perm_pk16_b6_u4:
+ case Intrinsic::amdgcn_perm_pk16_b8_u4:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
@@ -4902,6 +4953,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_permlane_bcast:
+ case Intrinsic::amdgcn_permlane_up:
+ case Intrinsic::amdgcn_permlane_down:
+ case Intrinsic::amdgcn_permlane_xor: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_permlane_idx_gen: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_permlane16_var:
case Intrinsic::amdgcn_permlanex16_var: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index dfe0cbf..10b8606 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -321,6 +321,11 @@ def : SourceOfDivergence<int_amdgcn_permlane16>;
def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_permlane16_var>;
def : SourceOfDivergence<int_amdgcn_permlanex16_var>;
+def : SourceOfDivergence<int_amdgcn_permlane_bcast>;
+def : SourceOfDivergence<int_amdgcn_permlane_up>;
+def : SourceOfDivergence<int_amdgcn_permlane_down>;
+def : SourceOfDivergence<int_amdgcn_permlane_xor>;
+def : SourceOfDivergence<int_amdgcn_permlane_idx_gen>;
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 94886b0..96cb5ae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -152,7 +152,12 @@ static bool isPermlane(const MachineInstr &MI) {
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
- Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
+ Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
}
static bool isLdsDma(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 33b66a6..96d5668 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -592,10 +592,13 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
@@ -666,10 +669,13 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
@@ -936,11 +942,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Pressure.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
- RegionsWithMinOcc.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
- RegionsWithMinOcc.reset();
RegionsWithIGLPInstrs.reset();
runSchedStages();
@@ -1090,8 +1094,7 @@ bool PreRARematStage::initGCNSchedStage() {
// fixed if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
- DAG.Regions.size() == 1)
+ if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
return false;
// Before performing any IR modification record the parent region of each MI
@@ -1133,11 +1136,6 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
- for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
- DAG.RegionsWithMinOcc[IDX] =
- DAG.Pressure[IDX].getOccupancy(
- DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
-
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
@@ -1209,11 +1207,15 @@ bool GCNSchedStage::initGCNRegion() {
}
bool UnclusteredHighRPStage::initGCNRegion() {
- // Only reschedule regions with the minimum occupancy or regions that may have
- // spilling (excess register pressure).
- if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
- DAG.MinOccupancy <= InitialOccupancy) &&
- !DAG.RegionsWithExcessRP[RegionIdx])
+ // Only reschedule regions that have excess register pressure (i.e. spilling)
+ // or had minimum occupancy at the beginning of the stage (as long as
+ // rescheduling of previous regions did not make occupancy drop back down to
+ // the initial minimum).
+ unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
+ if (!DAG.RegionsWithExcessRP[RegionIdx] &&
+ (DAG.MinOccupancy <= InitialOccupancy ||
+ DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
+ InitialOccupancy))
return false;
return GCNSchedStage::initGCNRegion();
@@ -1278,9 +1280,6 @@ void GCNSchedStage::checkScheduling() {
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
- DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
- DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -1314,7 +1313,6 @@ void GCNSchedStage::checkScheduling() {
if (NewOccupancy < DAG.MinOccupancy) {
DAG.MinOccupancy = NewOccupancy;
MFI.limitOccupancy(DAG.MinOccupancy);
- DAG.RegionsWithMinOcc.reset();
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< DAG.MinOccupancy << ".\n");
}
@@ -1336,14 +1334,10 @@ void GCNSchedStage::checkScheduling() {
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
- if (shouldRevertScheduling(WavesAfter)) {
+ if (shouldRevertScheduling(WavesAfter))
revertScheduling();
- } else {
+ else
DAG.Pressure[RegionIdx] = PressureAfter;
- DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
- DAG.MinOccupancy;
- }
}
unsigned
@@ -1573,9 +1567,6 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
}
void GCNSchedStage::revertScheduling() {
- DAG.RegionsWithMinOcc[RegionIdx] =
- PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
- DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94cd795..32139a9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -250,9 +250,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// limit. Register pressure in these regions usually will result in spilling.
BitVector RegionsWithExcessRP;
- // Regions that has the same occupancy as the latest MinOccupancy
- BitVector RegionsWithMinOcc;
-
// Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT).
BitVector RegionsWithIGLPInstrs;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 0a0a107..0237a60 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -340,6 +340,43 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
+void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
+ const SchedRegion &Region) const {
+ const Function &F = Region.RegionBegin->getMF()->getFunction();
+ Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
+ if (!PostRADirectionAttr.isValid())
+ return;
+
+ StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
+ if (PostRADirectionStr == "topdown") {
+ Policy.OnlyTopDown = true;
+ Policy.OnlyBottomUp = false;
+ } else if (PostRADirectionStr == "bottomup") {
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = true;
+ } else if (PostRADirectionStr == "bidirectional") {
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ } else {
+ DiagnosticInfoOptimizationFailure Diag(
+ F, F.getSubprogram(), "invalid value for postRA direction attribute");
+ F.getContext().diagnose(Diag);
+ }
+
+ LLVM_DEBUG({
+ const char *DirStr = "default";
+ if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
+ DirStr = "topdown";
+ else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
+ DirStr = "bottomup";
+ else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
+ DirStr = "bidirectional";
+
+ dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
+ << '\n';
+ });
+}
+
void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
if (isWave32()) {
// Fix implicit $vcc operands after MIParser has verified that they match
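
A hedged usage sketch of the attribute the new hook reads; the attribute name and accepted values come from the code above, while the helper function itself is hypothetical.

#include "llvm/IR/Function.h"

// Request bottom-up post-RA scheduling for one function. Accepted values per
// overridePostRASchedPolicy: "topdown", "bottomup", "bidirectional"; anything
// else triggers the DiagnosticInfoOptimizationFailure above.
static void requestBottomUpPostRA(llvm::Function &F) {
  F.addFnAttr("amdgpu-post-ra-direction", "bottomup");
}
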
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index bdd900d..c84ba1a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -236,6 +236,7 @@ protected:
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
bool HasTanhInsts = false;
+ bool HasTensorCvtLutInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1041,6 +1042,9 @@ public:
void overrideSchedPolicy(MachineSchedPolicy &Policy,
const SchedRegion &Region) const override;
+ void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
+ const SchedRegion &Region) const override;
+
void mirFileLoaded(MachineFunction &MF) const override;
unsigned getMaxNumUserSGPRs() const {
@@ -1408,6 +1412,8 @@ public:
bool hasTanhInsts() const { return HasTanhInsts; }
+ bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
+
bool hasAddPC64Inst() const { return GFX1250Insts; }
bool hasMinimum3Maximum3PKF16() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 2a920f6..4e4660c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -33,8 +33,7 @@ public:
AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value) const override;
@@ -129,9 +128,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (Target.getSpecifier())
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -148,13 +146,13 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- uint32_t Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value.
for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+ Data[i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
std::optional<MCFixupKind>
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 11552b3..9b348d4 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -983,6 +983,7 @@ void SIFrameLowering::emitCSRSpillStores(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
@@ -1005,6 +1006,12 @@ void SIFrameLowering::emitCSRSpillStores(
}
};
+ for (const Register Reg : make_first_range(WWMScratchRegs)) {
+ if (!MRI.isReserved(Reg)) {
+ MRI.addLiveIn(Reg);
+ MBB.addLiveIn(Reg);
+ }
+ }
StoreWWMRegisters(WWMScratchRegs);
auto EnableAllLanes = [&]() {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ad26757..4d67e4a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16825,56 +16825,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::pair(0U, RC);
}
- if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
- StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
- if (RegName.consume_front("v")) {
+ auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
+ if (Kind != '\0') {
+ if (Kind == 'v') {
RC = &AMDGPU::VGPR_32RegClass;
- } else if (RegName.consume_front("s")) {
+ } else if (Kind == 's') {
RC = &AMDGPU::SGPR_32RegClass;
- } else if (RegName.consume_front("a")) {
+ } else if (Kind == 'a') {
RC = &AMDGPU::AGPR_32RegClass;
}
if (RC) {
- uint32_t Idx;
- if (RegName.consume_front("[")) {
- uint32_t End;
- bool Failed = RegName.consumeInteger(10, Idx);
- Failed |= !RegName.consume_front(":");
- Failed |= RegName.consumeInteger(10, End);
- Failed |= !RegName.consume_back("]");
- if (!Failed) {
- uint32_t Width = (End - Idx + 1) * 32;
- // Prohibit constraints for register ranges with a width that does not
- // match the required type.
- if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
+ if (NumRegs > 1) {
+ if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs())
+ return std::pair(0U, nullptr);
+
+ uint32_t Width = NumRegs * 32;
+ // Prohibit constraints for register ranges with a width that does not
+ // match the required type.
+ if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
+ return std::pair(0U, nullptr);
+
+ MCRegister Reg = RC->getRegister(Idx);
+ if (SIRegisterInfo::isVGPRClass(RC))
+ RC = TRI->getVGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isSGPRClass(RC))
+ RC = TRI->getSGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isAGPRClass(RC))
+ RC = TRI->getAGPRClassForBitWidth(Width);
+ if (RC) {
+ Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
+ if (!Reg) {
+ // The register class does not contain the requested register,
+ // e.g., because it is an SGPR pair that would violate alignment
+ // requirements.
return std::pair(0U, nullptr);
- MCRegister Reg = RC->getRegister(Idx);
- if (SIRegisterInfo::isVGPRClass(RC))
- RC = TRI->getVGPRClassForBitWidth(Width);
- else if (SIRegisterInfo::isSGPRClass(RC))
- RC = TRI->getSGPRClassForBitWidth(Width);
- else if (SIRegisterInfo::isAGPRClass(RC))
- RC = TRI->getAGPRClassForBitWidth(Width);
- if (RC) {
- Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
- if (!Reg) {
- // The register class does not contain the requested register,
- // e.g., because it is an SGPR pair that would violate alignment
- // requirements.
- return std::pair(0U, nullptr);
- }
- return std::pair(Reg, RC);
}
+ return std::pair(Reg, RC);
}
- } else {
- // Check for lossy scalar/vector conversions.
- if (VT.isVector() && VT.getSizeInBits() != 32)
- return std::pair(0U, nullptr);
- bool Failed = RegName.getAsInteger(10, Idx);
- if (!Failed && Idx < RC->getNumRegs())
- return std::pair(RC->getRegister(Idx), RC);
}
+
+ // Check for lossy scalar/vector conversions.
+ if (VT.isVector() && VT.getSizeInBits() != 32)
+ return std::pair(0U, nullptr);
+ if (Idx < RC->getNumRegs())
+ return std::pair(RC->getRegister(Idx), RC);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044a681..3f61bbd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6304,10 +6304,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
};
if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
- Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
+ Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
// src1 and src2 must be scalar
MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
- MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
const DebugLoc &DL = MI.getDebugLoc();
if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -6315,11 +6319,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
.add(Src1);
Src1.ChangeToRegister(Reg, false);
}
- if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
- Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
- .add(Src2);
- Src2.ChangeToRegister(Reg, false);
+ if (VOP3Idx[2] != -1) {
+ MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
+ if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src2);
+ Src2.ChangeToRegister(Reg, false);
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a3e20ba..4698a58 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1774,6 +1774,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
!eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
+ !eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
!eq(VT.Size, 16) : op16,
@@ -1924,6 +1925,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
+ !eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}
@@ -2935,6 +2937,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
+def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>;
+def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>;
+def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
@@ -2948,6 +2953,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
+def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>;
+def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>;
def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>;
def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>;
def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>;
@@ -2955,11 +2962,26 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
+def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>;
+def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>;
+def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>;
+def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
+def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
+def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
+def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
+def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>;
+def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>;
+def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 36d1a3b..08d07c9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
+def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
// True 16 Operands
def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5827f18..65fa088 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1548,6 +1548,42 @@ bool shouldEmitConstantsToTextSection(const Triple &TT) {
return TT.getArch() == Triple::r600;
}
+static bool isValidRegPrefix(char C) {
+ return C == 'v' || C == 's' || C == 'a';
+}
+
+std::tuple<char, unsigned, unsigned>
+parseAsmConstraintPhysReg(StringRef Constraint) {
+ StringRef RegName = Constraint;
+ if (!RegName.consume_front("{") || !RegName.consume_back("}"))
+ return {};
+
+ char Kind = RegName.front();
+ if (!isValidRegPrefix(Kind))
+ return {};
+
+ RegName = RegName.drop_front();
+ if (RegName.consume_front("[")) {
+ unsigned Idx, End;
+ bool Failed = RegName.consumeInteger(10, Idx);
+ Failed |= !RegName.consume_front(":");
+ Failed |= RegName.consumeInteger(10, End);
+ Failed |= !RegName.consume_back("]");
+ if (!Failed) {
+ unsigned NumRegs = End - Idx + 1;
+ if (NumRegs > 1)
+ return {Kind, Idx, NumRegs};
+ }
+ } else {
+ unsigned Idx;
+ bool Failed = RegName.getAsInteger(10, Idx);
+ if (!Failed)
+ return {Kind, Idx, 1};
+ }
+
+ return {};
+}
+
std::pair<unsigned, unsigned>
getIntegerPairAttribute(const Function &F, StringRef Name,
std::pair<unsigned, unsigned> Default,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 74d59f4..1252e35 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1012,6 +1012,12 @@ bool isReadOnlySegment(const GlobalValue *GV);
/// target triple \p TT, false otherwise.
bool shouldEmitConstantsToTextSection(const Triple &TT);
+/// Parses a physical register inline-asm constraint. The first tuple entry is
+/// the register kind ('v', 's', or 'a'), or 0 if the constraint is not valid;
+/// it is followed by the start register index and the number of registers.
+/// Does not validate that the registers exist in the register class.
+std::tuple<char, unsigned, unsigned>
+parseAsmConstraintPhysReg(StringRef Constraint);
+
/// \returns Integer value requested using \p F's \p Name attribute.
///
/// \returns \p Default if attribute is not present.
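
A hedged usage sketch of the helper declared above; the include path and the wrapper function are illustrative, and the expected results follow the parsing code in AMDGPUBaseInfo.cpp.

#include "Utils/AMDGPUBaseInfo.h" // illustrative include path

void parseConstraintExamples() {
  using llvm::AMDGPU::parseAsmConstraintPhysReg;
  // Range form: kind 'v', start index 0, four registers.
  auto [Kind, Idx, NumRegs] = parseAsmConstraintPhysReg("{v[0:3]}");
  // Single-register form yields {'s', 5, 1}; anything unparsable yields a
  // default-constructed tuple {'\0', 0, 0}.
  auto [Kind2, Idx2, NumRegs2] = parseAsmConstraintPhysReg("{s5}");
  (void)Kind; (void)Idx; (void)NumRegs;
  (void)Kind2; (void)Idx2; (void)NumRegs2;
}
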
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 1ffe39d..f4b6af6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1053,6 +1053,14 @@ def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped
let HasExtDPP = 0;
}
+class VOP3_PERMLANE_NOOPSEL_Profile<VOPProfile P> : VOP3_Profile<P> {
+ let Ins64 = !con((ins VRegSrc_32:$src0, SSrc_b32:$src1),
+ !if(P.HasSrc2, (ins SSrc_b32:$src2), (ins)));
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
+
def opsel_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(
N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
@@ -1136,6 +1144,18 @@ class PermlaneVarPat<SDPatternOperator permlane,
VGPR_32:$src1, VGPR_32:$vdst_in)
>;
+class PermlaneNoDppPat3Src<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane i32:$src0, i32:$src1, i32:$src2),
+ (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2)
+>;
+
+class PermlaneNoDppPat2Src<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane i32:$src0, i32:$src1),
+ (inst VGPR_32:$src0, SCSrc_b32:$src1)
+>;
+
class VOP3_BITOP3_Profile<VOPProfile pfl, VOP3Features f> : VOP3_Profile<pfl, f> {
let HasClamp = 0;
let HasOMod = 0;
@@ -1522,6 +1542,20 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
+let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
+ defm V_PERMLANE_BCAST_B32 : VOP3Inst<"v_permlane_bcast_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_PERMLANE_UP_B32 : VOP3Inst<"v_permlane_up_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_PERMLANE_DOWN_B32 : VOP3Inst<"v_permlane_down_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_PERMLANE_XOR_B32 : VOP3Inst<"v_permlane_xor_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32>>;
+
+ def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64>;
+ def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64>;
+ def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64>;
+ def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64>;
+ def : PermlaneNoDppPat2Src<int_amdgcn_permlane_idx_gen, V_PERMLANE_IDX_GEN_B32_e64>;
+} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
+
let HasClamp = 0, HasModifiers = 1 in {
def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
@@ -1692,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no
}
}
+let HasExtVOP3DPP = 0, HasModifiers = 0 in {
+def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>;
+def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>;
+def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>;
+}
+
let Src0RC64 = VSrc_NoInline_v2f16 in {
def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
@@ -1737,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>;
defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>;
+ defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>;
+ defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>;
+ defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>;
+ defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>;
+ defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>;
+ defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>;
} // End Constraints = "@earlyclobber $vdst"
defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>;
@@ -1744,6 +1790,44 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
} // End ReadsModeReg = 0
+ let Constraints = "@earlyclobber $vdst" in {
+ let WaveSizePredicate = isWave32 in {
+ defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>;
+ defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>;
+ defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>;
+ defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>;
+ defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>;
+ } // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>;
+ defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>;
+
+ let WaveSizePredicate = isWave32 in {
+ defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
+ } // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
+ } // End Constraints = "@earlyclobber $vdst"
+
let True16Predicate = UseRealTrue16Insts in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
@@ -1754,6 +1838,12 @@ let SubtargetPredicate = isGFX1250Plus in {
}
} // End SubtargetPredicate = isGFX1250Plus
+let SubtargetPredicate = HasTensorCvtLutInsts in {
+ defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>;
+ defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>;
+ defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>;
+} // End SubtargetPredicate = HasTensorCvtLutInsts
+
class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
@@ -1973,6 +2063,11 @@ defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>;
defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
+defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>;
+defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>;
+defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>;
+defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>;
+defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>;
//===----------------------------------------------------------------------===//
// GFX11, GFX12
@@ -2147,6 +2242,9 @@ let AssemblerPredicate = isGFX11Plus in {
}
// These instructions differ from the GFX12 variants by supporting DPP:
+defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
+defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
+defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
@@ -2159,6 +2257,42 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
+defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>;
+defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>;
+defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>;
+defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>;
+defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>;
+defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>;
+defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>;
+defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>;
+defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>;
+defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>;
+defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>;
+defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>;
+defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>;
+defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>;
+defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>;
+defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>;
+defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>;
+defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
+defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
+defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
+defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index bd4b75f..9366256 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5521,18 +5521,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (Op.getValueType().isInteger()) {
- // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
- // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
- // supported types.
- if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
- CTVal->isOne() && CFVal->isAllOnes() &&
- LHS.getValueType() == TrueVal.getValueType()) {
- EVT VT = LHS.getValueType();
- SDValue Shift =
- DAG.getNode(ISD::SRA, dl, VT, LHS,
- DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
- return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
- }
// Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
// (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
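The combine removed above folded the sign pattern (x > -1 ? 1 : -1) into ((x >> N-1) | 1) directly in the ARM backend. A minimal standalone check of that identity, assuming an arithmetic right shift for signed values (guaranteed since C++20); the helper names are illustrative and not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// (x > -1 ? 1 : -1) equals ((x >> 31) | 1) for 32-bit signed x: the shift
// yields 0 for non-negative x and -1 (all ones) for negative x.
static int32_t signSelect(int32_t X) { return X > -1 ? 1 : -1; }
static int32_t signShiftOr(int32_t X) { return (X >> 31) | 1; }

int main() {
  for (int32_t X : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX})
    assert(signSelect(X) == signShiftOr(X));
}
```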
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 146fc67..c221d22 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1108,9 +1108,8 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F,
}
void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -1124,14 +1123,15 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
return; // Doesn't change encoding.
const unsigned NumBytes = getFixupKindNumBytes(Kind);
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (Endian == llvm::endianness::big) {
FullSizeBytes = getFixupKindContainerSizeBytes(Kind);
- assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
+ assert(Fixup.getOffset() + FullSizeBytes <= F.getSize() &&
+ "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
@@ -1141,7 +1141,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx =
Endian == llvm::endianness::little ? i : (FullSizeBytes - 1 - i);
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
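With the MutableArrayRef<char> parameter replaced by a uint8_t pointer that already points at the fixup offset, the per-byte indexing becomes relative to the fixup itself. A standalone sketch of the loop's shape above (names are illustrative, not the backend API):

```cpp
#include <cstdint>

// OR the resolved fixup value into the fragment bytes. Data points at the
// fixup offset; on big-endian targets the index counts down from the end of
// the instruction container (FullSizeBytes), mirroring the loop above.
static void orFixupBytes(uint8_t *Data, uint64_t Value, unsigned NumBytes,
                         unsigned FullSizeBytes, bool LittleEndian) {
  for (unsigned I = 0; I != NumBytes; ++I) {
    unsigned Idx = LittleEndian ? I : (FullSizeBytes - 1 - I);
    Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
  }
}
```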
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 07d2cf7..2844232 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -40,8 +40,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const;
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 128cc0b..05a7d03 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -368,9 +368,8 @@ AVRAsmBackend::createObjectTargetWriter() const {
}
void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// AVR sets the fixup value to bypass the assembly time overflow with a
// relocation.
if (IsResolved) {
@@ -397,14 +396,14 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i < NumBytes; ++i) {
uint8_t mask = (((Value >> (i * 8)) & 0xff));
- Data[Offset + i] |= mask;
+ Data[i] |= mask;
}
}
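The AVR path follows the same pattern, except the value is first shifted to its bit position within the container before the byte-wise OR. A hedged standalone sketch of what the rewritten loop does:

```cpp
#include <cstdint>

// Shift the fixup value to its target bit offset, then OR it byte-by-byte
// into the fragment; Data points directly at the fixup offset after the
// API change above.
static void applyShiftedFixup(uint8_t *Data, uint64_t Value,
                              unsigned TargetOffset, unsigned NumBytes) {
  Value <<= TargetOffset;
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
```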
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 68c839e..9633669 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -38,8 +38,7 @@ public:
createObjectTargetWriter() const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index dda8753..53933f9 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -27,8 +27,7 @@ public:
~BPFAsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -66,35 +65,32 @@ bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
}
void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
if (Fixup.getKind() == FK_SecRel_8) {
// The Value is 0 for global variables, and the in-section offset
// for static variables. Write to the immediate field of the inst.
assert(Value <= UINT32_MAX);
- support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4],
- static_cast<uint32_t>(Value),
+ support::endian::write<uint32_t>(Data + 4, static_cast<uint32_t>(Value),
Endian);
} else if (Fixup.getKind() == FK_Data_4 && !Fixup.isPCRel()) {
- support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
+ support::endian::write<uint32_t>(Data, Value, Endian);
} else if (Fixup.getKind() == FK_Data_8) {
- support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian);
+ support::endian::write<uint64_t>(Data, Value, Endian);
} else if (Fixup.getKind() == FK_Data_4 && Fixup.isPCRel()) {
Value = (uint32_t)((Value - 8) / 8);
if (Endian == llvm::endianness::little) {
- Data[Fixup.getOffset() + 1] = 0x10;
- support::endian::write32le(&Data[Fixup.getOffset() + 4], Value);
+ Data[1] = 0x10;
+ support::endian::write32le(Data + 4, Value);
} else {
- Data[Fixup.getOffset() + 1] = 0x1;
- support::endian::write32be(&Data[Fixup.getOffset() + 4], Value);
+ Data[1] = 0x1;
+ support::endian::write32be(Data + 4, Value);
}
} else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) {
// The input Value represents the number of bytes.
Value = (uint32_t)((Value - 8) / 8);
- support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value,
- Endian);
+ support::endian::write<uint32_t>(Data + 4, Value, Endian);
} else {
assert(Fixup.getKind() == FK_Data_2 && Fixup.isPCRel());
@@ -103,8 +99,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
report_fatal_error("Branch target out of insn range");
Value = (uint16_t)((Value - 8) / 8);
- support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value,
- Endian);
+ support::endian::write<uint16_t>(Data + 2, Value, Endian);
}
}
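The repeated `(Value - 8) / 8` above comes from how BPF encodes branch targets: the offset is relative to the instruction following the branch and is counted in 8-byte instruction slots. A standalone sketch of that conversion (the helper name is illustrative):

```cpp
#include <cstdint>

// Convert a byte distance measured from the branch instruction into the BPF
// branch immediate: subtract the 8-byte branch itself so the offset is
// relative to the next instruction, then count in 8-byte slots.
static uint32_t bpfBranchImm(uint64_t ByteDistance) {
  return static_cast<uint32_t>((ByteDistance - 8) / 8);
}
```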
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index 694d9ea..6964998 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -197,9 +197,8 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F,
}
void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -217,10 +216,10 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
@@ -228,14 +227,14 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
bool IsInstFixup = (Kind >= FirstTargetFixupKind);
if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) {
- Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff);
- Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff);
- Data[Offset + 2] |= uint8_t(Value & 0xff);
- Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff);
+ Data[0] |= uint8_t((Value >> 16) & 0xff);
+ Data[1] |= uint8_t((Value >> 24) & 0xff);
+ Data[2] |= uint8_t(Value & 0xff);
+ Data[3] |= uint8_t((Value >> 8) & 0xff);
} else {
for (unsigned I = 0; I != NumBytes; I++) {
unsigned Idx = IsLittleEndian ? I : (NumBytes - 1 - I);
- Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
}
}
}
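For 4-byte instruction fixups on little-endian CSKY, the instruction word appears to be stored as two little-endian half-words with the high half first, which is why the fixup bytes land at indices {2, 3, 0, 1} of the value's own little-endian byte order. A sketch of that special case, with Data pointing at the fixup offset as in the rewritten code:

```cpp
#include <cstdint>

// Little-endian CSKY, 4-byte instruction fixup: OR the value in as two
// little-endian half-words, high half first.
static void orCskyInstFixup(uint8_t *Data, uint32_t Value) {
  Data[0] |= uint8_t((Value >> 16) & 0xff);
  Data[1] |= uint8_t((Value >> 24) & 0xff);
  Data[2] |= uint8_t(Value & 0xff);
  Data[3] |= uint8_t((Value >> 8) & 0xff);
}
```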
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index 1c8516f..5d8826a 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -25,8 +25,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 5323be6..9a14c01 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -78,8 +78,7 @@ public:
~DXILAsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override {}
+ uint8_t *Data, uint64_t Value, bool IsResolved) override {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 7d3074b..1a0f1ab 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -402,8 +402,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char> Data, uint64_t FixupValue,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t FixupValue, bool IsResolved) override;
bool isInstRelaxable(MCInst const &HMI) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI);
@@ -649,8 +648,7 @@ public:
} // namespace
void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *InstAddr,
uint64_t FixupValue, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup))
IsResolved = false;
@@ -667,10 +665,9 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// LLVM gives us an encoded value, we have to convert it back
// to a real offset before we can use it.
- uint32_t Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
- char *InstAddr = Data.data() + Offset;
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
Value = adjustFixupValue(Kind, FixupValue);
if (!Value)
@@ -757,8 +754,8 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |=
(InstAddr[i] << (i * 8)) & (0xff << (i * 8));
dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x";
- dbgs().write_hex(FixupValue)
- << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x";
+ dbgs().write_hex(FixupValue) << ": Offset=" << Fixup.getOffset()
+ << ": Size=" << F.getSize() << ": OInst=0x";
dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc););
// For each byte of the fragment that the fixup touches, mask in the
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index 83d1697..3112dea 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -48,8 +48,7 @@ public:
: MCAsmBackend(llvm::endianness::big), OSType(OST) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -72,9 +71,8 @@ bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
}
void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
@@ -85,7 +83,6 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Where in the object and the number of bytes that need
// fixing up.
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
unsigned FullSize = 4;
@@ -95,8 +92,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Load instruction and apply value
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = (FullSize - 1 - i);
- CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx]))
- << (i * 8);
+ CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Idx])) << (i * 8);
}
uint64_t Mask =
@@ -106,7 +102,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Write out the fixed up bytes back to the code/data bits.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = (FullSize - 1 - i);
- Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
+ Data[Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
}
}
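Lanai is big-endian and patches fixups with a read-modify-write of the full 32-bit instruction word; after the change the bytes are addressed relative to the fixup rather than the fragment start. A simplified sketch of that shape (the real code derives Mask from the fixup kind's target size and only touches the bytes the fixup covers):

```cpp
#include <cstdint>

// Read the big-endian 32-bit instruction word, OR in the masked fixup bits,
// and write it back most-significant byte first.
static void patchBigEndian32(uint8_t *Data, uint32_t Value, uint32_t Mask) {
  uint32_t Cur = (uint32_t(Data[0]) << 24) | (uint32_t(Data[1]) << 16) |
                 (uint32_t(Data[2]) << 8) | uint32_t(Data[3]);
  Cur |= Value & Mask;
  Data[0] = uint8_t(Cur >> 24);
  Data[1] = uint8_t(Cur >> 16);
  Data[2] = uint8_t(Cur >> 8);
  Data[3] = uint8_t(Cur);
}
```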
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index d9ea88c..fda9d97 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -131,19 +131,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup,
- MutableArrayRef<char> Data, uint64_t Value) {
+static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data,
+ uint64_t Value) {
unsigned I;
- for (I = 0; I != Data.size() && Value; ++I, Value >>= 7)
+ for (I = 0; Value; ++I, Value >>= 7)
Data[I] |= uint8_t(Value & 0x7f);
if (Value)
Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!");
}
void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
IsResolved = addReloc(F, Fixup, Target, Value, IsResolved);
@@ -166,14 +165,14 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned I = 0; I != NumBytes; ++I) {
- Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
}
@@ -274,15 +273,14 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
int64_t LineDelta = F.getDwarfLineDelta();
const MCExpr &AddrDelta = F.getDwarfAddrDelta();
- SmallVector<MCFixup, 1> Fixups;
size_t OldSize = F.getVarSize();
int64_t Value;
if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
return false;
- bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm);
- assert(IsAbsolute && "CFA with invalid expression");
- (void)IsAbsolute;
+ [[maybe_unused]] bool IsAbsolute =
+ AddrDelta.evaluateKnownAbsolute(Value, *Asm);
+ assert(IsAbsolute);
SmallVector<char> Data;
raw_svector_ostream OS(Data);
@@ -293,33 +291,23 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
encodeSLEB128(LineDelta, OS);
}
- unsigned Offset;
- std::pair<MCFixupKind, MCFixupKind> FK;
-
// According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode
// takes a single unsigned half (unencoded) operand. The maximum encodable
// value is therefore 65535. Set a conservative upper bound for relaxation.
+ unsigned PCBytes;
if (Value > 60000) {
unsigned PtrSize = C.getAsmInfo()->getCodePointerSize();
-
- OS << uint8_t(dwarf::DW_LNS_extended_op);
- encodeULEB128(PtrSize + 1, OS);
-
- OS << uint8_t(dwarf::DW_LNE_set_address);
- Offset = OS.tell();
assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size");
- FK = getRelocPairForSize(PtrSize == 4 ? 32 : 64);
+ PCBytes = PtrSize;
+ OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PtrSize + 1)
+ << uint8_t(dwarf::DW_LNE_set_address);
OS.write_zeros(PtrSize);
} else {
+ PCBytes = 2;
OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc);
- Offset = OS.tell();
- FK = getRelocPairForSize(16);
support::endian::write<uint16_t>(OS, 0, llvm::endianness::little);
}
-
- const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
- Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK)));
- Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK)));
+ auto Offset = OS.tell() - PCBytes;
if (LineDelta == INT64_MAX) {
OS << uint8_t(dwarf::DW_LNS_extended_op);
@@ -330,7 +318,8 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
}
F.setVarContents(Data);
- F.setVarFixups(Fixups);
+ F.setVarFixups({MCFixup::create(Offset, &AddrDelta,
+ MCFixup::getDataKindForSize(PCBytes))});
WasRelaxed = OldSize != Data.size();
return true;
}
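The leb128 patch above now works through a raw pointer, so it no longer carries an explicit buffer bound and simply ORs 7 payload bits into each reserved byte. A bounded standalone sketch of the same scheme (names are illustrative):

```cpp
#include <cstddef>
#include <cstdint>

// Patch an already-emitted uleb128 field in place: OR 7 bits of Value into
// each byte's payload. Returns false if Value does not fit in the reserved
// MaxBytes, mirroring the error path above.
static bool patchUleb128(uint8_t *Data, size_t MaxBytes, uint64_t Value) {
  size_t I = 0;
  for (; I != MaxBytes && Value; ++I, Value >>= 7)
    Data[I] |= uint8_t(Value & 0x7f);
  return Value == 0;
}
```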
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index 3d929fc..1f13601 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -42,8 +42,7 @@ public:
uint64_t &FixedValue, bool IsResolved);
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target);
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index 5e03903..fe83dc6 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -53,8 +53,7 @@ public:
.Default(false)) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override;
@@ -78,14 +77,13 @@ public:
} // end anonymous namespace
void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
- assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!");
// Check that upper bits are either all zeros or all ones.
// Specifically ignore overflow/underflow as long as the leakage is
// limited to the lower bits. This is to remain compatible with
@@ -95,8 +93,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Write in Big Endian
for (unsigned i = 0; i != Size; ++i)
- Data[Fixup.getOffset() + i] =
- uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8));
+ Data[i] = uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8));
}
/// cc—Carry clear GE—Greater than or equal
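M68k stores fixup values big-endian, most significant byte first, after the upper-bits check; with the pointer-based API the store starts at Data[0]. A standalone sketch of that store:

```cpp
#include <cstdint>

// Big-endian store of a (possibly sign-extended) fixup value into Size bytes,
// with Data pointing at the fixup offset.
static void storeBigEndian(uint8_t *Data, int64_t Value, unsigned Size) {
  for (unsigned I = 0; I != Size; ++I)
    Data[I] = uint8_t(Value >> ((Size - I - 1) * 8));
}
```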
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index 29e5bfa..d892b3a 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -36,8 +36,7 @@ public:
~MSP430AsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
@@ -105,9 +104,8 @@ uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup,
}
void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
Value = adjustFixupValue(Fixup, Value, getContext());
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
@@ -117,15 +115,14 @@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
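Several of the backends above (CSKY, LoongArch, MSP430) derive the number of touched bytes from the fixup's bit size and bit offset with `alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8`. A standalone equivalent of that computation:

```cpp
// A fixup covering TargetSize bits starting TargetOffset bits into its
// container touches ceil((TargetSize + TargetOffset) / 8) bytes.
static unsigned fixupNumBytes(unsigned TargetSize, unsigned TargetOffset) {
  return (TargetSize + TargetOffset + 7) / 8;
}
```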
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index c2169be..33aab71 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -283,9 +283,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) {
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (shouldForceRelocation(Fixup))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -297,7 +296,6 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
return; // Doesn't change encoding.
// Where do we start in the object
- unsigned Offset = Fixup.getOffset();
// Number of bytes we need to fixup
unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
// Used to point to big endian bytes
@@ -328,7 +326,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
unsigned Idx = Endian == llvm::endianness::little
? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
: (FullSize - 1 - i);
- CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
+ CurVal |= (uint64_t)((uint8_t)Data[Idx]) << (i * 8);
}
uint64_t Mask = ((uint64_t)(-1) >>
@@ -340,7 +338,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
unsigned Idx = Endian == llvm::endianness::little
? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
: (FullSize - 1 - i);
- Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
+ Data[Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 816626d..40b5853 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -40,8 +40,7 @@ public:
createObjectTargetWriter() const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..7a8395a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
// fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(8, 0);
@@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_TPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_TPREL64);
S.appendContents(8, 0);
}
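Each helper above now reserves space in the current fragment before recording the fixup and appending the placeholder bytes, which keeps the fixup and the bytes it patches in the same fragment. A hedged sketch of that three-call pattern as a small wrapper; the wrapper itself is not part of the patch and only restates the sequence used above:

```cpp
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"

using namespace llvm;

// Reserve headroom first so addFixup records an offset that appendContents
// will fill within the same fragment, as the Mips emit*Value helpers do.
static void emitValueWithFixup(MCELFStreamer &S, const MCExpr *Value,
                               MCFixupKind Kind, unsigned Size) {
  S.ensureHeadroom(Size);
  S.addFixup(Value, Kind);
  S.appendContents(Size, 0);
}
```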
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 65d1be3..15f45a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -382,6 +382,54 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
}
+// Return an EVT that can hold N copies of VT. If VT is a vector, the result
+// is a flat vector with the same element type and N times as many elements.
+static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
+ if (N == 1)
+ return VT;
+
+ return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
+ VT.getVectorNumElements() * N)
+ : EVT::getVectorVT(C, VT, N);
+}
+
+static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
+ const SDLoc &dl, SelectionDAG &DAG) {
+ if (V.getValueType() == VT) {
+ assert(I == 0 && "Index must be 0 for scalar value");
+ return V;
+ }
+
+ if (!VT.isVector())
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
+ DAG.getVectorIdxConstant(I, dl));
+
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, V,
+ DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl));
+}
+
+template <typename T>
+static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
+ SelectionDAG &DAG, T GetElement) {
+ if (N == 1)
+ return GetElement(0);
+
+ SmallVector<SDValue, 8> Values;
+ for (const unsigned I : llvm::seq(N)) {
+ SDValue Val = GetElement(I);
+ if (Val.getValueType().isVector())
+ DAG.ExtractVectorElements(Val, Values);
+ else
+ Values.push_back(Val);
+ }
+
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
+ Values.size());
+ return DAG.getBuildVector(VT, dl, Values);
+}
+
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
@@ -420,9 +468,10 @@ static EVT promoteScalarIntegerPTX(const EVT VT) {
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
-static unsigned CanMergeParamLoadStoresStartingAt(
+template <typename T>
+static unsigned canMergeParamLoadStoresStartingAt(
unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
+ const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
// Can't vectorize if param alignment is not sufficient.
if (ParamAlignment < AccessSize)
@@ -472,10 +521,11 @@ static unsigned CanMergeParamLoadStoresStartingAt(
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
+template <typename T>
static SmallVector<unsigned, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets,
- Align ParamAlignment, bool IsVAArg = false) {
+ const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
+ bool IsVAArg = false) {
// Set vector size to match ValueVTs and mark all elements as
// scalars by default.
@@ -486,7 +536,7 @@ VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
const auto GetNumElts = [&](unsigned I) -> unsigned {
for (const unsigned AccessSize : {16, 8, 4, 2}) {
- const unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+ const unsigned NumElts = canMergeParamLoadStoresStartingAt(
I, AccessSize, ValueVTs, Offsets, ParamAlignment);
assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
"Unexpected vectorization size");
@@ -1384,6 +1434,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Type *RetTy = CLI.RetTy;
const CallBase *CB = CLI.CB;
const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
const auto GetI32 = [&](const unsigned I) {
return DAG.getConstant(I, dl, MVT::i32);
@@ -1476,15 +1527,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const SDValue ParamSymbol =
getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
-
assert((!IsByVal || Arg.IndirectType) &&
"byval arg must have indirect type");
Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
- ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
- assert(VTs.size() == Offsets.size() && "Size mismatch");
- assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch");
const Align ArgAlign = [&]() {
if (IsByVal) {
@@ -1492,17 +1537,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// so we don't need to worry whether it's naturally aligned or not.
// See TargetLowering::LowerCallTo().
const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
- const Align ByValAlign = getFunctionByValParamAlign(
- CB->getCalledFunction(), ETy, InitialAlign, DL);
- if (IsVAArg)
- VAOffset = alignTo(VAOffset, ByValAlign);
- return ByValAlign;
+ return getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
+ InitialAlign, DL);
}
return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
}();
- const unsigned TypeSize = DL.getTypeAllocSize(ETy);
- assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
+ const unsigned TySize = DL.getTypeAllocSize(ETy);
+ assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
const SDValue ArgDeclare = [&]() {
@@ -1510,105 +1552,120 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return VADeclareParam;
if (IsByVal || shouldPassAsArray(Arg.Ty))
- return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+ return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
"Only int and float types are supported as non-array arguments");
- return MakeDeclareScalarParam(ParamSymbol, TypeSize);
+ return MakeDeclareScalarParam(ParamSymbol, TySize);
}();
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
- // than 32-bits are sign extended or zero extended, depending on
- // whether they are signed or unsigned types. This case applies
- // only to scalar parameters and not to aggregate values.
- const bool ExtendIntegerParam =
- Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
+ if (IsByVal) {
+ assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
+ SDValue SrcPtr = ArgOutVals[0];
+ const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
+ const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
- const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
- const MaybeAlign PartAlign) {
- if (IsByVal) {
- SDValue Ptr = ArgOutVals[0];
- auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
- SDValue SrcAddr =
- DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
-
- return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
+ if (IsVAArg)
+ VAOffset = alignTo(VAOffset, ArgAlign);
+
+ SmallVector<EVT, 4> ValueVTs, MemVTs;
+ SmallVector<TypeSize, 4> Offsets;
+ ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
+
+ unsigned J = 0;
+ const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
+ for (const unsigned NumElts : VI) {
+ EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
+ Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
+ SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
+ SDValue SrcLoad =
+ DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
+
+ TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
+ Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
+ SDValue ParamAddr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), ParamAlign);
+ CallPrereqs.push_back(StoreParam);
+
+ J += NumElts;
}
- SDValue StVal = ArgOutVals[I];
- assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
- StVal.getValueType() &&
- "OutVal type should always be legal");
-
- const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
- const EVT StoreVT =
- ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
-
- return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
- };
-
- const auto VectorInfo =
- VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
-
- unsigned J = 0;
- for (const unsigned NumElts : VectorInfo) {
- const int CurOffset = Offsets[J];
- const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
-
- if (IsVAArg && !IsByVal)
- // Align each part of the variadic argument to their type.
- VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
-
- assert((IsVAArg || VAOffset == 0) &&
- "VAOffset must be 0 for non-VA args");
+ if (IsVAArg)
+ VAOffset += TySize;
+ } else {
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset);
+ assert(VTs.size() == Offsets.size() && "Size mismatch");
+ assert(VTs.size() == ArgOuts.size() && "Size mismatch");
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+ // than 32-bits are sign extended or zero extended, depending on
+ // whether they are signed or unsigned types. This case applies
+ // only to scalar parameters and not to aggregate values.
+ const bool ExtendIntegerParam =
+ Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
+
+ const auto GetStoredValue = [&](const unsigned I) {
+ SDValue StVal = ArgOutVals[I];
+ assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+ StVal.getValueType() &&
+ "OutVal type should always be legal");
+
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT StoreVT =
+ ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+ return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
+ };
+
+ unsigned J = 0;
+ const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
+ for (const unsigned NumElts : VI) {
+ const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
+
+ unsigned Offset;
+ if (IsVAArg) {
+ // TODO: We may need to support vector types that can be passed
+ // as scalars in variadic arguments.
+ assert(NumElts == 1 &&
+ "Vectorization should be disabled for vaargs.");
+
+ // Align each part of the variadic argument to their type.
+ VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
+ Offset = VAOffset;
+
+ const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
+ VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
+ } else {
+ assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
+ Offset = Offsets[J];
+ }
- const unsigned Offset =
- (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
- SDValue Ptr =
- DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
- const MaybeAlign CurrentAlign = ExtendIntegerParam
- ? MaybeAlign(std::nullopt)
- : commonAlignment(ArgAlign, Offset);
+ const MaybeAlign CurrentAlign = ExtendIntegerParam
+ ? MaybeAlign(std::nullopt)
+ : commonAlignment(ArgAlign, Offset);
- SDValue Val;
- if (NumElts == 1) {
- Val = GetStoredValue(J, EltVT, CurrentAlign);
- } else {
- SmallVector<SDValue, 8> StoreVals;
- for (const unsigned K : llvm::seq(NumElts)) {
- SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
- if (ValJ.getValueType().isVector())
- DAG.ExtractVectorElements(ValJ, StoreVals);
- else
- StoreVals.push_back(ValJ);
- }
+ SDValue Val =
+ getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
+ return GetStoredValue(J + K);
+ });
- EVT VT = EVT::getVectorVT(
- *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
- Val = DAG.getBuildVector(VT, dl, StoreVals);
- }
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, Val, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+ CallPrereqs.push_back(StoreParam);
- SDValue StoreParam =
- DAG.getStore(ArgDeclare, dl, Val, Ptr,
- MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
- CallPrereqs.push_back(StoreParam);
-
- // TODO: We may need to support vector types that can be passed
- // as scalars in variadic arguments.
- if (IsVAArg && !IsByVal) {
- assert(NumElts == 1 &&
- "Vectorization is expected to be disabled for variadics.");
- const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
- VAOffset +=
- DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
+ J += NumElts;
}
-
- J += NumElts;
}
- if (IsVAArg && IsByVal)
- VAOffset += TypeSize;
}
// Handle Result
@@ -1676,17 +1733,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallPrereqs.push_back(PrototypeDeclare);
}
- if (ConvertToIndirectCall) {
- // Copy the function ptr to a ptx register and use the register to call the
- // function.
- const MVT DestVT = Callee.getValueType().getSimpleVT();
- MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT));
- auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
- Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
- }
-
const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
const unsigned NumArgs =
std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
@@ -1703,10 +1749,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
// 32-bits are sign extended or zero extended, depending on whether
@@ -1714,9 +1761,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const bool ExtendIntegerRetVal =
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
- const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned NumElts : VectorInfo) {
+ const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+ for (const unsigned NumElts : VI) {
const MaybeAlign CurrentAlign =
ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
: commonAlignment(RetAlign, Offsets[I]);
@@ -1724,16 +1771,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
const EVT LoadVT =
ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
-
- const unsigned PackingAmt =
- LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
-
- const EVT VecVT = NumElts == 1 ? LoadVT
- : EVT::getVectorVT(*DAG.getContext(),
- LoadVT.getScalarType(),
- NumElts * PackingAmt);
-
- const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
SDValue Ptr =
DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
@@ -1742,17 +1780,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
LoadChains.push_back(R.getValue(1));
-
- if (NumElts == 1)
- ProxyRegOps.push_back(R);
- else
- for (const unsigned J : llvm::seq(NumElts)) {
- SDValue Elt = DAG.getNode(
- LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
- : ISD::EXTRACT_VECTOR_ELT,
- dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
- ProxyRegOps.push_back(Elt);
- }
+ for (const unsigned J : llvm::seq(NumElts))
+ ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
I += NumElts;
}
}
@@ -3227,11 +3256,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- MachineFunction &MF = DAG.getMachineFunction();
const DataLayout &DL = DAG.getDataLayout();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- const Function *F = &MF.getFunction();
+ const Function &F = DAG.getMachineFunction().getFunction();
SDValue Root = DAG.getRoot();
SmallVector<SDValue, 16> OutChains;
@@ -3247,7 +3275,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// See similar issue in LowerCall.
auto AllIns = ArrayRef(Ins);
- for (const auto &Arg : F->args()) {
+ for (const auto &Arg : F.args()) {
const auto ArgIns = AllIns.take_while(
[&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
AllIns = AllIns.drop_front(ArgIns.size());
@@ -3287,7 +3315,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
SDValue P;
- if (isKernelFunction(*F)) {
+ if (isKernelFunction(F)) {
P = ArgSymbol;
P.getNode()->setIROrder(Arg.getArgNo() + 1);
} else {
@@ -3305,43 +3333,27 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
assert(VTs.size() == Offsets.size() && "Size mismatch");
const Align ArgAlign = getFunctionArgumentAlignment(
- F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
+ &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
- const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
unsigned I = 0;
- for (const unsigned NumElts : VectorInfo) {
+ const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+ for (const unsigned NumElts : VI) {
// i1 is loaded/stored as i8
const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
- // If the element is a packed type (ex. v2f16, v4i8, etc) holding
- // multiple elements.
- const unsigned PackingAmt =
- LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
-
- const EVT VecVT =
- NumElts == 1
- ? LoadVT
- : EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(),
- NumElts * PackingAmt);
+ const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext());
SDValue VecAddr = DAG.getObjectPtrOffset(
dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
- const MaybeAlign PartAlign = commonAlignment(ArgAlign, Offsets[I]);
+ const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
SDValue P =
DAG.getLoad(VecVT, dl, Root, VecAddr,
MachinePointerInfo(ADDRESS_SPACE_PARAM), PartAlign,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(Arg.getArgNo() + 1);
+ P.getNode()->setIROrder(Arg.getArgNo() + 1);
for (const unsigned J : llvm::seq(NumElts)) {
- SDValue Elt =
- NumElts == 1
- ? P
- : DAG.getNode(LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
- : ISD::EXTRACT_VECTOR_ELT,
- dl, LoadVT, P,
- DAG.getVectorIdxConstant(J * PackingAmt, dl));
+ SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
DAG, dl);
@@ -3364,9 +3376,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
- const MachineFunction &MF = DAG.getMachineFunction();
- const Function &F = MF.getFunction();
- Type *RetTy = MF.getFunction().getReturnType();
+ const Function &F = DAG.getMachineFunction().getFunction();
+ Type *RetTy = F.getReturnType();
if (RetTy->isVoidTy()) {
assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
@@ -3374,10 +3385,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
const DataLayout &DL = DAG.getDataLayout();
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
- assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+
+ const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
+ const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
// 32-bits are sign extended or zero extended, depending on whether
@@ -3385,6 +3395,11 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const bool ExtendIntegerRetVal =
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+ assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+
const auto GetRetVal = [&](unsigned I) -> SDValue {
SDValue RetVal = OutVals[I];
assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
@@ -3397,33 +3412,16 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
};
- const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
- const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned NumElts : VectorInfo) {
+ const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+ for (const unsigned NumElts : VI) {
const MaybeAlign CurrentAlign = ExtendIntegerRetVal
? MaybeAlign(std::nullopt)
: commonAlignment(RetAlign, Offsets[I]);
- SDValue Val;
- if (NumElts == 1) {
- Val = GetRetVal(I);
- } else {
- SmallVector<SDValue, 4> StoreVals;
- for (const unsigned J : llvm::seq(NumElts)) {
- SDValue ValJ = GetRetVal(I + J);
- if (ValJ.getValueType().isVector())
- DAG.ExtractVectorElements(ValJ, StoreVals);
- else
- StoreVals.push_back(ValJ);
- }
-
- EVT VT = EVT::getVectorVT(F.getContext(), StoreVals[0].getValueType(),
- StoreVals.size());
- Val = DAG.getBuildVector(VT, dl, StoreVals);
- }
+ SDValue Val = getBuildVectorizedValue(
+ NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
- const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
SDValue Ptr =
DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index d8047d3..2ae7520 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1602,8 +1602,6 @@ foreach is_convergent = [0, 1] in {
}
defvar call_inst = !cast<NVPTXInst>("CALL" # convergent_suffix);
- def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, globaladdr:$addr, imm:$proto),
- (call_inst (to_tglobaladdr $addr), imm:$rets, imm:$params, imm:$proto)>;
def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i32:$addr, imm:$proto),
(call_inst $addr, imm:$rets, imm:$params, imm:$proto)>;
def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i64:$addr, imm:$proto),
@@ -1612,10 +1610,6 @@ foreach is_convergent = [0, 1] in {
defvar call_uni_inst = !cast<NVPTXInst>("CALL_UNI" # convergent_suffix);
def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, globaladdr:$addr, 0),
(call_uni_inst (to_tglobaladdr $addr), imm:$rets, imm:$params)>;
- def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i32:$addr, 0),
- (call_uni_inst $addr, imm:$rets, imm:$params)>;
- def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i64:$addr, 0),
- (call_uni_inst $addr, imm:$rets, imm:$params)>;
}
def DECLARE_PARAM_array :
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 0e8828f..ec97e2e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -93,8 +93,8 @@ public:
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) override;
+ const MCValue &Target, uint8_t *Data, uint64_t Value,
+ bool IsResolved) override;
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) {
// If there is a @ specifier, unless it is optimized out (e.g. constant @l),
@@ -185,9 +185,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &TargetVal,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &TargetVal, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to
// reference the null symbol.
auto Target = TargetVal;
@@ -205,7 +204,6 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (!Value)
return; // Doesn't change encoding.
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
// For each byte of the fragment that the fixup touches, mask in the bits
@@ -213,7 +211,7 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1 - i);
- Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
}
}
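As an aside on the recurring applyFixup signature change in this patch: the fixup data pointer now already points at Fixup.getOffset() within the fragment, so backends index from zero instead of adding the offset themselves. A minimal standalone sketch of that convention, using a made-up fragment, offset, and value (none of these come from the patch):

  #include <cstdint>
  #include <cstdio>

  // Little-endian masking loop; Data is assumed to be pre-offset to the fixup.
  static void applyLE(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
    for (unsigned I = 0; I != NumBytes; ++I)
      Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
  }

  int main() {
    uint8_t Fragment[8] = {};
    const unsigned FixupOffset = 2; // hypothetical Fixup.getOffset()
    applyLE(Fragment + FixupOffset, 0xDEADBEEF, 4);
    for (uint8_t B : Fragment)
      std::printf("%02x ", unsigned(B)); // 00 00 ef be ad de 00 00
    std::printf("\n");
  }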
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 459525e..f179873 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7296,9 +7296,17 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
ValVT.isInteger() &&
ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
- SDValue ArgValueTrunc = DAG.getNode(
- ISD::TRUNCATE, dl, ArgVT.getSimpleVT() == MVT::i1 ? MVT::i8 : ArgVT,
- ArgValue);
+ // It is possible to have either real integer values
+ // or integers that were not originally integers.
+ // In the latter case, these could have come from structs,
+ // and such integers would not have an extend on the parameter.
+ // Since these integers do not have an extend specified in the
+ // first place, the kind of extend we perform here should not matter.
+ EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
+ ? MVT::i8
+ : ArgVT;
+ SDValue ArgValueTrunc =
+ DAG.getNode(ISD::TRUNCATE, dl, TruncatedArgVT, ArgValue);
SDValue ArgValueExt =
ArgSignExt ? DAG.getSExtOrTrunc(ArgValueTrunc, dl, ValVT)
: DAG.getZExtOrTrunc(ArgValueTrunc, dl, ValVT);
diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
index 5eb1f01..b7e2263 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -100,10 +100,14 @@ bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
@@ -189,10 +193,14 @@ bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
// Keep clustered nodes together.
- const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
- const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
- if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
- CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 82e3b5c..eb7460e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -327,19 +327,19 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
- MCContext &C = getContext();
-
int64_t LineDelta = F.getDwarfLineDelta();
const MCExpr &AddrDelta = F.getDwarfAddrDelta();
- SmallVector<MCFixup, 1> Fixups;
size_t OldSize = F.getVarSize();
int64_t Value;
+ // If the label difference can be resolved, use the default handling, which
+ // utilizes a shorter special opcode.
+ if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
+ return false;
[[maybe_unused]] bool IsAbsolute =
AddrDelta.evaluateKnownAbsolute(Value, *Asm);
assert(IsAbsolute && "CFA with invalid expression");
- Fixups.clear();
SmallVector<char> Data;
raw_svector_ostream OS(Data);
@@ -349,33 +349,21 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
encodeSLEB128(LineDelta, OS);
}
- unsigned Offset;
- std::pair<MCFixupKind, MCFixupKind> Fixup;
-
// According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode
// takes a single unsigned half (unencoded) operand. The maximum encodable
// value is therefore 65535. Set a conservative upper bound for relaxation.
+ unsigned PCBytes;
if (Value > 60000) {
- unsigned PtrSize = C.getAsmInfo()->getCodePointerSize();
-
- OS << uint8_t(dwarf::DW_LNS_extended_op);
- encodeULEB128(PtrSize + 1, OS);
-
- OS << uint8_t(dwarf::DW_LNE_set_address);
- Offset = OS.tell();
- assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size");
- Fixup = RISCV::getRelocPairForSize(PtrSize);
- OS.write_zeros(PtrSize);
+ PCBytes = getContext().getAsmInfo()->getCodePointerSize();
+ OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PCBytes + 1)
+ << uint8_t(dwarf::DW_LNE_set_address);
+ OS.write_zeros(PCBytes);
} else {
+ PCBytes = 2;
OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc);
- Offset = OS.tell();
- Fixup = RISCV::getRelocPairForSize(2);
support::endian::write<uint16_t>(OS, 0, llvm::endianness::little);
}
-
- const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
- Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(Fixup)));
- Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(Fixup)));
+ auto Offset = OS.tell() - PCBytes;
if (LineDelta == INT64_MAX) {
OS << uint8_t(dwarf::DW_LNS_extended_op);
@@ -386,7 +374,8 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
}
F.setVarContents(Data);
- F.setVarFixups(Fixups);
+ F.setVarFixups({MCFixup::create(Offset, &AddrDelta,
+ MCFixup::getDataKindForSize(PCBytes))});
WasRelaxed = OldSize != Data.size();
return true;
}
@@ -881,9 +870,8 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
}
void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
IsResolved = addReloc(F, Fixup, Target, Value, IsResolved);
MCFixupKind Kind = Fixup.getKind();
if (mc::isRelocation(Kind))
@@ -898,15 +886,14 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
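The relaxDwarfLineAddr rewrite above drops the ADD/SUB relocation pairs and instead emits a single data fixup of PCBytes bytes that starts PCBytes before the current stream position: a 2-byte DW_LNS_fixed_advance_pc operand for small address deltas, or a pointer-sized DW_LNE_set_address operand otherwise. A rough standalone sketch of that choice (the opcode values are the standard DWARF ones; the 60000 threshold mirrors the code above):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Standard DWARF line-number program opcodes.
  constexpr uint8_t DW_LNS_extended_op = 0x00;
  constexpr uint8_t DW_LNS_fixed_advance_pc = 0x09;
  constexpr uint8_t DW_LNE_set_address = 0x02;

  // Emits the address-advance bytes and returns the offset of the
  // PCBytes-wide fixup within Out.
  static size_t emitAddrAdvance(std::vector<uint8_t> &Out, uint64_t Delta,
                                unsigned PtrSize) {
    unsigned PCBytes;
    if (Delta > 60000) {
      PCBytes = PtrSize;
      Out.push_back(DW_LNS_extended_op);
      Out.push_back(uint8_t(PCBytes + 1)); // ULEB128 length, fits in one byte
      Out.push_back(DW_LNE_set_address);
      Out.insert(Out.end(), PCBytes, 0);   // operand filled in by the fixup
    } else {
      PCBytes = 2;
      Out.push_back(DW_LNS_fixed_advance_pc);
      Out.insert(Out.end(), PCBytes, 0);   // uhalf operand, filled by the fixup
    }
    return Out.size() - PCBytes;
  }

  int main() {
    std::vector<uint8_t> Small, Large;
    std::printf("fixup offsets: %zu %zu\n", emitAddrAdvance(Small, 1234, 8),
                emitAddrAdvance(Large, 70000, 8)); // prints: 1 3
  }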
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index d97d632..adec1ec 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -46,8 +46,7 @@ public:
void maybeAddVendorReloc(const MCFragment &, const MCFixup &);
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index f816561c..98c8738 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -68,27 +68,6 @@ enum Fixups {
fixup_riscv_invalid,
NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind
};
-
-static inline std::pair<MCFixupKind, MCFixupKind>
-getRelocPairForSize(unsigned Size) {
- switch (Size) {
- default:
- llvm_unreachable("unsupported fixup size");
- case 1:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD8,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB8);
- case 2:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD16,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB16);
- case 4:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD32,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB32);
- case 8:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD64,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB64);
- }
-}
-
} // end namespace llvm::RISCV
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f223fdbe..5998653 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2827,6 +2827,8 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
static bool isWorthFoldingAdd(SDValue Add) {
for (auto *User : Add->users()) {
if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE &&
+ User->getOpcode() != RISCVISD::LD_RV32 &&
+ User->getOpcode() != RISCVISD::SD_RV32 &&
User->getOpcode() != ISD::ATOMIC_LOAD &&
User->getOpcode() != ISD::ATOMIC_STORE)
return false;
@@ -2841,6 +2843,9 @@ static bool isWorthFoldingAdd(SDValue Add) {
if (User->getOpcode() == ISD::ATOMIC_STORE &&
cast<AtomicSDNode>(User)->getVal() == Add)
return false;
+ if (User->getOpcode() == RISCVISD::SD_RV32 &&
+ (User->getOperand(0) == Add || User->getOperand(1) == Add))
+ return false;
if (isStrongerThanMonotonic(cast<MemSDNode>(User)->getSuccessOrdering()))
return false;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c0ada51..adbfbeb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1819,6 +1819,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::riscv_seg6_load_mask:
case Intrinsic::riscv_seg7_load_mask:
case Intrinsic::riscv_seg8_load_mask:
+ case Intrinsic::riscv_sseg2_load_mask:
+ case Intrinsic::riscv_sseg3_load_mask:
+ case Intrinsic::riscv_sseg4_load_mask:
+ case Intrinsic::riscv_sseg5_load_mask:
+ case Intrinsic::riscv_sseg6_load_mask:
+ case Intrinsic::riscv_sseg7_load_mask:
+ case Intrinsic::riscv_sseg8_load_mask:
return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false,
/*IsUnitStrided*/ false, /*UsePtrVal*/ true);
case Intrinsic::riscv_seg2_store_mask:
@@ -10938,6 +10945,97 @@ static inline SDValue getVCIXISDNodeVOID(SDValue &Op, SelectionDAG &DAG,
return DAG.getNode(Type, SDLoc(Op), Op.getValueType(), Operands);
}
+static SDValue
+lowerFixedVectorSegLoadIntrinsics(unsigned IntNo, SDValue Op,
+ const RISCVSubtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool IsStrided;
+ switch (IntNo) {
+ case Intrinsic::riscv_seg2_load_mask:
+ case Intrinsic::riscv_seg3_load_mask:
+ case Intrinsic::riscv_seg4_load_mask:
+ case Intrinsic::riscv_seg5_load_mask:
+ case Intrinsic::riscv_seg6_load_mask:
+ case Intrinsic::riscv_seg7_load_mask:
+ case Intrinsic::riscv_seg8_load_mask:
+ IsStrided = false;
+ break;
+ case Intrinsic::riscv_sseg2_load_mask:
+ case Intrinsic::riscv_sseg3_load_mask:
+ case Intrinsic::riscv_sseg4_load_mask:
+ case Intrinsic::riscv_sseg5_load_mask:
+ case Intrinsic::riscv_sseg6_load_mask:
+ case Intrinsic::riscv_sseg7_load_mask:
+ case Intrinsic::riscv_sseg8_load_mask:
+ IsStrided = true;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
+
+ static const Intrinsic::ID VlsegInts[7] = {
+ Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+ Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+ Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+ Intrinsic::riscv_vlseg8_mask};
+ static const Intrinsic::ID VlssegInts[7] = {
+ Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask,
+ Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask,
+ Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask,
+ Intrinsic::riscv_vlsseg8_mask};
+
+ SDLoc DL(Op);
+ unsigned NF = Op->getNumValues() - 1;
+ assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op->getSimpleValueType(0);
+ MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
+ ContainerVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
+
+ // Operands: (chain, int_id, pointer, mask, vl) or
+ // (chain, int_id, pointer, stride, mask, vl)
+ SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
+ MVT MaskVT = Mask.getSimpleValueType();
+ MVT MaskContainerVT =
+ ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+ Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+
+ SDValue IntID = DAG.getTargetConstant(
+ IsStrided ? VlssegInts[NF - 2] : VlsegInts[NF - 2], DL, XLenVT);
+ auto *Load = cast<MemIntrinsicSDNode>(Op);
+
+ SDVTList VTs = DAG.getVTList({VecTupTy, MVT::Other});
+ SmallVector<SDValue, 9> Ops = {
+ Load->getChain(),
+ IntID,
+ DAG.getUNDEF(VecTupTy),
+ Op.getOperand(2),
+ Mask,
+ VL,
+ DAG.getTargetConstant(
+ RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT),
+ DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
+ // Insert the stride operand.
+ if (IsStrided)
+ Ops.insert(std::next(Ops.begin(), 4), Op.getOperand(3));
+
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ Load->getMemoryVT(), Load->getMemOperand());
+ SmallVector<SDValue, 9> Results;
+ for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) {
+ SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
+ Result.getValue(0),
+ DAG.getTargetConstant(RetIdx, DL, MVT::i32));
+ Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget));
+ }
+ Results.push_back(Result.getValue(1));
+ return DAG.getMergeValues(Results, DL);
+}
+
SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = Op.getConstantOperandVal(1);
@@ -10950,57 +11048,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::riscv_seg5_load_mask:
case Intrinsic::riscv_seg6_load_mask:
case Intrinsic::riscv_seg7_load_mask:
- case Intrinsic::riscv_seg8_load_mask: {
- SDLoc DL(Op);
- static const Intrinsic::ID VlsegInts[7] = {
- Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
- Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
- Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
- Intrinsic::riscv_vlseg8_mask};
- unsigned NF = Op->getNumValues() - 1;
- assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
- MVT XLenVT = Subtarget.getXLenVT();
- MVT VT = Op->getSimpleValueType(0);
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
- unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
- ContainerVT.getScalarSizeInBits();
- EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
-
- // Operands: (chain, int_id, pointer, mask, vl)
- SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
- SDValue Mask = Op.getOperand(3);
- MVT MaskVT = Mask.getSimpleValueType();
- MVT MaskContainerVT =
- ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
- Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
-
- SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT);
- auto *Load = cast<MemIntrinsicSDNode>(Op);
+ case Intrinsic::riscv_seg8_load_mask:
+ case Intrinsic::riscv_sseg2_load_mask:
+ case Intrinsic::riscv_sseg3_load_mask:
+ case Intrinsic::riscv_sseg4_load_mask:
+ case Intrinsic::riscv_sseg5_load_mask:
+ case Intrinsic::riscv_sseg6_load_mask:
+ case Intrinsic::riscv_sseg7_load_mask:
+ case Intrinsic::riscv_sseg8_load_mask:
+ return lowerFixedVectorSegLoadIntrinsics(IntNo, Op, Subtarget, DAG);
- SDVTList VTs = DAG.getVTList({VecTupTy, MVT::Other});
- SDValue Ops[] = {
- Load->getChain(),
- IntID,
- DAG.getUNDEF(VecTupTy),
- Op.getOperand(2),
- Mask,
- VL,
- DAG.getTargetConstant(
- RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT),
- DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
- SDValue Result =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
- Load->getMemoryVT(), Load->getMemOperand());
- SmallVector<SDValue, 9> Results;
- for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) {
- SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
- Result.getValue(0),
- DAG.getTargetConstant(RetIdx, DL, MVT::i32));
- Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget));
- }
- Results.push_back(Result.getValue(1));
- return DAG.getMergeValues(Results, DL);
- }
case Intrinsic::riscv_sf_vc_v_x_se:
return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_X_SE);
case Intrinsic::riscv_sf_vc_v_i_se:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 31ea2de..cc2977c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -910,7 +910,7 @@ foreach vti = AllIntegerVectors in {
foreach vti = I64IntegerVectors in {
let Predicates = [HasVInstructionsI64] in {
def : Pat<(add (vti.Vector vti.RegClass:$rs1),
- (vti.Vector (SplatPat_imm64_neg i64:$rs2))),
+ (vti.Vector (SplatPat_imm64_neg (i64 GPR:$rs2)))),
(!cast<Instruction>("PseudoVSUB_VX_"#vti.LMul.MX)
(vti.Vector (IMPLICIT_DEF)),
vti.RegClass:$rs1,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 695223b..acbccdd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2123,7 +2123,7 @@ foreach vti = AllIntegerVectors in {
foreach vti = I64IntegerVectors in {
let Predicates = [HasVInstructionsI64] in {
def : Pat<(riscv_add_vl (vti.Vector vti.RegClass:$rs1),
- (vti.Vector (SplatPat_imm64_neg i64:$rs2)),
+ (vti.Vector (SplatPat_imm64_neg (i64 GPR:$rs2))),
vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag),
(!cast<Instruction>("PseudoVSUB_VX_"#vti.LMul.MX#"_MASK")
vti.RegClass:$passthru, vti.RegClass:$rs1,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index c0f7ab1..4c31ce4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -590,12 +590,12 @@ let Predicates = [HasVendorXTHeadBb, IsRV64] in {
def : PatGprImm<riscv_rorw, TH_SRRIW, uimm5>;
def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
(TH_SRRIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
-def : Pat<(sra (bswap i64:$rs1), (i64 32)),
- (TH_REVW i64:$rs1)>;
-def : Pat<(binop_allwusers<srl> (bswap i64:$rs1), (i64 32)),
- (TH_REVW i64:$rs1)>;
-def : Pat<(riscv_clzw i64:$rs1),
- (TH_FF0 (i64 (SLLI (i64 (XORI i64:$rs1, -1)), 32)))>;
+def : Pat<(i64 (sra (bswap GPR:$rs1), (i64 32))),
+ (TH_REVW GPR:$rs1)>;
+def : Pat<(binop_allwusers<srl> (bswap GPR:$rs1), (i64 32)),
+ (TH_REVW GPR:$rs1)>;
+def : Pat<(riscv_clzw GPR:$rs1),
+ (TH_FF0 (i64 (SLLI (i64 (XORI GPR:$rs1, -1)), 32)))>;
} // Predicates = [HasVendorXTHeadBb, IsRV64]
let Predicates = [HasVendorXTHeadBs] in {
@@ -697,11 +697,13 @@ def uimm2_4 : Operand<XLenVT>, ImmLeaf<XLenVT, [{
}], uimm2_4_XFORM>;
let Predicates = [HasVendorXTHeadMemPair, IsRV64] in {
-def : Pat<(th_lwud i64:$rs1, uimm2_3:$uimm2_3), (TH_LWUD i64:$rs1, uimm2_3:$uimm2_3, 3)>;
-def : Pat<(th_ldd i64:$rs1, uimm2_4:$uimm2_4), (TH_LDD i64:$rs1, uimm2_4:$uimm2_4, 4)>;
+def : Pat<(th_lwud GPR:$rs1, (i64 uimm2_3:$uimm2_3)),
+ (TH_LWUD GPR:$rs1, uimm2_3:$uimm2_3, 3)>;
+def : Pat<(th_ldd GPR:$rs1, (i64 uimm2_4:$uimm2_4)),
+ (TH_LDD GPR:$rs1, uimm2_4:$uimm2_4, 4)>;
-def : Pat<(th_sdd i64:$rd1, i64:$rd2, i64:$rs1, uimm2_4:$uimm2_4),
- (TH_SDD i64:$rd1, i64:$rd2, i64:$rs1, uimm2_4:$uimm2_4, 4)>;
+def : Pat<(th_sdd (i64 GPR:$rd1), GPR:$rd2, GPR:$rs1, uimm2_4:$uimm2_4),
+ (TH_SDD GPR:$rd1, GPR:$rd2, GPR:$rs1, uimm2_4:$uimm2_4, 4)>;
}
let Predicates = [HasVendorXTHeadMemPair] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index a250ac8..1efe616 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -1128,13 +1128,13 @@ let Predicates = [HasStdExtZvkned] in {
let Predicates = [HasStdExtZvknha] in {
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>;
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>;
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>;
} // Predicates = [HasStdExtZvknha]
let Predicates = [HasStdExtZvknhb] in {
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>;
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32I64IntegerVectors>;
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>;
} // Predicates = [HasStdExtZvknhb]
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
index ef84d43..5710cf2 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
@@ -21,8 +21,7 @@ public:
SPIRVAsmBackend(llvm::endianness Endian) : MCAsmBackend(Endian) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override {}
+ uint8_t *Data, uint64_t Value, bool IsResolved) override {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 6ec7544..25cdf72 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -148,6 +148,7 @@ struct ConvertBuiltin {
bool IsSaturated;
bool IsRounded;
bool IsBfloat16;
+ bool IsTF32;
FPRoundingMode::FPRoundingMode RoundingMode;
};
@@ -230,6 +231,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall,
// - "__spirv_SubgroupImageMediaBlockReadINTEL"
// - "__spirv_SubgroupImageMediaBlockWriteINTEL"
// - "__spirv_Convert"
+ // - "__spirv_Round"
// - "__spirv_UConvert"
// - "__spirv_SConvert"
// - "__spirv_FConvert"
@@ -242,7 +244,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall,
"SDotKHR|SUDotKHR|SDotAccSatKHR|UDotAccSatKHR|SUDotAccSatKHR|"
"ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|"
"SubgroupImageMediaBlockReadINTEL|SubgroupImageMediaBlockWriteINTEL|"
- "Convert|"
+ "Convert|Round|"
"UConvert|SConvert|FConvert|SatConvert)[^_]*)(_R[^_]*_?(\\w+)?.*)?");
std::smatch Match;
if (std::regex_match(BuiltinName, Match, SpvWithR) && Match.size() > 1) {
@@ -697,7 +699,8 @@ static bool buildAtomicStoreInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
if (Call->isSpirvOp())
- return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, Register(0));
+ return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call,
+ Register(0));
Register ScopeRegister =
buildConstantIntReg32(SPIRV::Scope::Device, MIRBuilder, GR);
@@ -2677,8 +2680,20 @@ static bool generateConvertInst(const StringRef DemangledCall,
}
} else if (GR->isScalarOrVectorOfType(Call->ReturnRegister,
SPIRV::OpTypeFloat)) {
- // Float -> Float
- Opcode = SPIRV::OpFConvert;
+ if (Builtin->IsTF32) {
+ const auto *ST = static_cast<const SPIRVSubtarget *>(
+ &MIRBuilder.getMF().getSubtarget());
+ if (!ST->canUseExtension(
+ SPIRV::Extension::SPV_INTEL_tensor_float32_conversion))
+ NeedExtMsg = "SPV_INTEL_tensor_float32_conversion";
+ IsRightComponentsNumber =
+ GR->getScalarOrVectorComponentCount(Call->Arguments[0]) ==
+ GR->getScalarOrVectorComponentCount(Call->ReturnRegister);
+ Opcode = SPIRV::OpRoundFToTF32INTEL;
+ } else {
+ // Float -> Float
+ Opcode = SPIRV::OpFConvert;
+ }
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index ea78dcd..d08560b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1461,6 +1461,8 @@ class ConvertBuiltin<string name, InstructionSet set> {
bit IsRounded = !not(!eq(!find(name, "_rt"), -1));
bit IsBfloat16 = !or(!not(!eq(!find(name, "BF16"), -1)),
!not(!eq(!find(name, "bfloat16"), -1)));
+ bit IsTF32 = !or(!not(!eq(!find(name, "TF32"), -1)),
+ !not(!eq(!find(name, "tensor_float32"), -1)));
FPRoundingMode RoundingMode = !cond(!not(!eq(!find(name, "_rte"), -1)) : RTE,
!not(!eq(!find(name, "_rtz"), -1)) : RTZ,
!not(!eq(!find(name, "_rtp"), -1)) : RTP,
@@ -1472,7 +1474,7 @@ class ConvertBuiltin<string name, InstructionSet set> {
def ConvertBuiltins : GenericTable {
let FilterClass = "ConvertBuiltin";
let Fields = ["Name", "Set", "IsDestinationSigned", "IsSaturated",
- "IsRounded", "IsBfloat16", "RoundingMode"];
+ "IsRounded", "IsBfloat16", "IsTF32", "RoundingMode"];
string TypeOf_Set = "InstructionSet";
string TypeOf_RoundingMode = "FPRoundingMode";
}
@@ -1556,6 +1558,25 @@ foreach conv = ["FToBF16INTEL", "BF16ToFINTEL"] in {
def : ConvertBuiltin<!strconcat("__spirv_Convert", conv), OpenCL_std>;
}
+// cl_intel_tensor_float32_conversions / SPV_INTEL_tensor_float32_conversion
+// Multiclass used to define at the same time both a demangled builtin record
+// and a corresponding convert builtin record.
+multiclass DemangledTF32RoundBuiltin<string name1, string name2> {
+ // Create records for scalar and vector conversions.
+ foreach i = ["", "2", "3", "4", "8", "16"] in {
+ def : DemangledBuiltin<!strconcat("intel_round_", name1, i, name2, i), OpenCL_std, Convert, 1, 1>;
+ def : ConvertBuiltin<!strconcat("intel_round_", name1, i, name2, i), OpenCL_std>;
+ }
+}
+
+defm : DemangledTF32RoundBuiltin<"tensor_float32", "_as_float">;
+defm : DemangledTF32RoundBuiltin<"as_tensor_float32", "_float">;
+
+foreach conv = ["FToTF32INTEL"] in {
+ def : DemangledBuiltin<!strconcat("__spirv_Round", conv), OpenCL_std, Convert, 1, 1>;
+ def : ConvertBuiltin<!strconcat("__spirv_Round", conv), OpenCL_std>;
+}
+
//===----------------------------------------------------------------------===//
// Class defining a vector data load/store builtin record used for lowering
// into OpExtInst instruction.
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 2726203..d9265f4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -102,7 +102,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::SPV_INTEL_2d_block_io},
{"SPV_INTEL_int4", SPIRV::Extension::Extension::SPV_INTEL_int4},
{"SPV_KHR_float_controls2",
- SPIRV::Extension::Extension::SPV_KHR_float_controls2}};
+ SPIRV::Extension::Extension::SPV_KHR_float_controls2},
+ {"SPV_INTEL_tensor_float32_conversion",
+ SPIRV::Extension::Extension::SPV_INTEL_tensor_float32_conversion}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 3c631ce..947b574 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -194,6 +194,42 @@ class SPIRVEmitIntrinsics
void useRoundingMode(ConstrainedFPIntrinsic *FPI, IRBuilder<> &B);
+ // Tries to walk the type accessed by the given GEP instruction.
+ // For each nested type access, one of the two callbacks is called:
+ //  - OnLiteralIndexing when the index is a known constant value.
+ //    Parameters:
+ //      PointedType: the pointed type resulting from this indexing.
+ //      Index: index of the element in the parent type.
+ //        If the parent type is an array, this is the index into the array.
+ //        If the parent type is a struct, this is the field index.
+ //  - OnDynamicIndexing when the index is a non-constant value.
+ //    This callback is only called when indexing into an array.
+ //    Parameters:
+ //      ElementType: the type of the elements stored in the parent array.
+ //      Offset: the Value* containing the byte offset into the array.
+ // Returns true if an error occurred during the walk, false otherwise.
+ bool walkLogicalAccessChain(
+ GetElementPtrInst &GEP,
+ const std::function<void(Type *PointedType, uint64_t Index)>
+ &OnLiteralIndexing,
+ const std::function<void(Type *ElementType, Value *Offset)>
+ &OnDynamicIndexing);
+
+ // Returns the type accessed using the given GEP instruction by relying
+ // on the GEP type.
+ // FIXME: GEP types are not supposed to be used to retrieve the pointed
+ // type. This must be fixed.
+ Type *getGEPType(GetElementPtrInst *GEP);
+
+ // Returns the type accessed using the given GEP instruction by walking
+ // the source type using the GEP indices.
+ // FIXME: without help from the frontend, this method cannot reliably
+ // retrieve the stored type, nor can it robustly determine the depth of
+ // the type we are accessing.
+ Type *getGEPTypeLogical(GetElementPtrInst *GEP);
+
+ Instruction *buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP);
+
public:
static char ID;
SPIRVEmitIntrinsics(SPIRVTargetMachine *TM = nullptr)
@@ -246,6 +282,17 @@ bool expectIgnoredInIRTranslation(const Instruction *I) {
}
}
+// Returns the source pointer from `I` ignoring intermediate ptrcast.
+Value *getPointerRoot(Value *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::spv_ptrcast) {
+ Value *V = II->getArgOperand(0);
+ return getPointerRoot(V);
+ }
+ }
+ return I;
+}
+
} // namespace
char SPIRVEmitIntrinsics::ID = 0;
@@ -555,7 +602,112 @@ void SPIRVEmitIntrinsics::maybeAssignPtrType(Type *&Ty, Value *Op, Type *RefTy,
Ty = RefTy;
}
-Type *getGEPType(GetElementPtrInst *Ref) {
+bool SPIRVEmitIntrinsics::walkLogicalAccessChain(
+ GetElementPtrInst &GEP,
+ const std::function<void(Type *, uint64_t)> &OnLiteralIndexing,
+ const std::function<void(Type *, Value *)> &OnDynamicIndexing) {
+ // We only rewrite i8* GEPs. Others should be left as-is.
+ // A valid i8* GEP must always have a single index.
+ assert(GEP.getSourceElementType() ==
+ IntegerType::getInt8Ty(CurrF->getContext()));
+ assert(GEP.getNumIndices() == 1);
+
+ auto &DL = CurrF->getDataLayout();
+ Value *Src = getPointerRoot(GEP.getPointerOperand());
+ Type *CurType = deduceElementType(Src, true);
+
+ Value *Operand = *GEP.idx_begin();
+ ConstantInt *CI = dyn_cast<ConstantInt>(Operand);
+ if (!CI) {
+ ArrayType *AT = dyn_cast<ArrayType>(CurType);
+ // Operand is not constant. Either we have an array and accept it, or we
+ // give up.
+ if (AT)
+ OnDynamicIndexing(AT->getElementType(), Operand);
+ return AT == nullptr;
+ }
+
+ assert(CI);
+ uint64_t Offset = CI->getZExtValue();
+
+ do {
+ if (ArrayType *AT = dyn_cast<ArrayType>(CurType)) {
+ uint32_t EltTypeSize = DL.getTypeSizeInBits(AT->getElementType()) / 8;
+ assert(Offset < AT->getNumElements() * EltTypeSize);
+ uint64_t Index = Offset / EltTypeSize;
+ Offset = Offset - (Index * EltTypeSize);
+ CurType = AT->getElementType();
+ OnLiteralIndexing(CurType, Index);
+ } else if (StructType *ST = dyn_cast<StructType>(CurType)) {
+ uint32_t StructSize = DL.getTypeSizeInBits(ST) / 8;
+ assert(Offset < StructSize);
+ (void)StructSize;
+ const auto &STL = DL.getStructLayout(ST);
+ unsigned Element = STL->getElementContainingOffset(Offset);
+ Offset -= STL->getElementOffset(Element);
+ CurType = ST->getElementType(Element);
+ OnLiteralIndexing(CurType, Element);
+ } else {
+ // Vector type indexing should not use GEP.
+ // So if we have an index left, something is wrong. Giving up.
+ return true;
+ }
+ } while (Offset > 0);
+
+ return false;
+}
+
+Instruction *
+SPIRVEmitIntrinsics::buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP) {
+ auto &DL = CurrF->getDataLayout();
+ IRBuilder<> B(GEP.getParent());
+ B.SetInsertPoint(&GEP);
+
+ std::vector<Value *> Indices;
+ Indices.push_back(ConstantInt::get(
+ IntegerType::getInt32Ty(CurrF->getContext()), 0, /* Signed= */ false));
+ walkLogicalAccessChain(
+ GEP,
+ [&Indices, &B](Type *EltType, uint64_t Index) {
+ Indices.push_back(
+ ConstantInt::get(B.getInt64Ty(), Index, /* Signed= */ false));
+ },
+ [&Indices, &B, &DL](Type *EltType, Value *Offset) {
+ uint32_t EltTypeSize = DL.getTypeSizeInBits(EltType) / 8;
+ Value *Index = B.CreateUDiv(
+ Offset, ConstantInt::get(Offset->getType(), EltTypeSize,
+ /* Signed= */ false));
+ Indices.push_back(Index);
+ });
+
+ SmallVector<Type *, 2> Types = {GEP.getType(), GEP.getOperand(0)->getType()};
+ SmallVector<Value *, 4> Args;
+ Args.push_back(B.getInt1(GEP.isInBounds()));
+ Args.push_back(GEP.getOperand(0));
+ llvm::append_range(Args, Indices);
+ auto *NewI = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args});
+ replaceAllUsesWithAndErase(B, &GEP, NewI);
+ return NewI;
+}
+
+Type *SPIRVEmitIntrinsics::getGEPTypeLogical(GetElementPtrInst *GEP) {
+
+ Type *CurType = GEP->getResultElementType();
+
+ bool Interrupted = walkLogicalAccessChain(
+ *GEP, [&CurType](Type *EltType, uint64_t Index) { CurType = EltType; },
+ [&CurType](Type *EltType, Value *Index) { CurType = EltType; });
+
+ return Interrupted ? GEP->getResultElementType() : CurType;
+}
+
+Type *SPIRVEmitIntrinsics::getGEPType(GetElementPtrInst *Ref) {
+ if (Ref->getSourceElementType() ==
+ IntegerType::getInt8Ty(CurrF->getContext()) &&
+ TM->getSubtargetImpl()->isLogicalSPIRV()) {
+ return getGEPTypeLogical(Ref);
+ }
+
Type *Ty = nullptr;
// TODO: not sure if GetElementPtrInst::getTypeAtIndex() does anything
// useful here
@@ -1395,6 +1547,13 @@ Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) {
}
Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) {
+ if (I.getSourceElementType() == IntegerType::getInt8Ty(CurrF->getContext()) &&
+ TM->getSubtargetImpl()->isLogicalSPIRV()) {
+ Instruction *Result = buildLogicalAccessChainFromGEP(I);
+ if (Result)
+ return Result;
+ }
+
IRBuilder<> B(I.getParent());
B.SetInsertPoint(&I);
SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()};
@@ -1588,7 +1747,24 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
}
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
Value *Pointer = GEPI->getPointerOperand();
- Type *OpTy = GEPI->getSourceElementType();
+ Type *OpTy = nullptr;
+
+ // Knowing the accessed type is mandatory for logical SPIR-V. Sadly,
+ // the GEP source element type should not be used for this purpose, and
+ // the alternative type-scavenging method does not always work.
+ // Physical SPIR-V can work around this, but logical cannot, hence we
+ // still try to rely on the (imperfect) type scavenging for logical.
+ bool IsRewrittenGEP =
+ GEPI->getSourceElementType() == IntegerType::getInt8Ty(I->getContext());
+ if (IsRewrittenGEP && TM->getSubtargetImpl()->isLogicalSPIRV()) {
+ Value *Src = getPointerRoot(Pointer);
+ OpTy = GR->findDeducedElementType(Src);
+ }
+
+ // In all cases, fall back to the GEP type if type scavenging failed.
+ if (!OpTy)
+ OpTy = GEPI->getSourceElementType();
+
replacePointerOperandWithPtrCast(I, Pointer, OpTy, 0, B);
if (isNestedPointer(OpTy))
insertTodoType(Pointer);
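walkLogicalAccessChain above peels one level of the deduced pointee type per iteration, converting the byte offset of an i8 GEP into a chain of literal struct/array indices that buildLogicalAccessChainFromGEP then feeds to spv_gep. A standalone sketch of that arithmetic for an assumed struct { int32_t a; float b[4]; } and byte offset 12 (the layout is hardcoded here rather than queried from a DataLayout):

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  int main() {
    // Assumed layout of struct { int32_t a; float b[4]; }.
    const uint64_t FieldOffsets[] = {0, 4}; // a at byte 0, b at byte 4
    const uint64_t FloatSize = 4;

    uint64_t Offset = 12; // offset of the i8 GEP being rewritten

    // Struct step: pick the field containing the offset, as
    // StructLayout::getElementContainingOffset would.
    unsigned Field = Offset >= FieldOffsets[1] ? 1 : 0;
    Offset -= FieldOffsets[Field];

    // Array step: divide the remaining offset by the element size.
    uint64_t Index = Offset / FloatSize;
    Offset -= Index * FloatSize;
    assert(Offset == 0 && "offset fully consumed");

    // buildLogicalAccessChainFromGEP prepends a leading 0 for the pointer.
    std::printf("spv_gep indices: 0, %u, %llu\n", Field,
                (unsigned long long)Index); // prints: 0, 1, 2
  }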
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 049ba02..f0b938d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -445,6 +445,9 @@ def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938
def OpConvertFToBF16INTEL : UnOp<"OpConvertFToBF16INTEL", 6116>;
def OpConvertBF16ToFINTEL : UnOp<"OpConvertBF16ToFINTEL", 6117>;
+// SPV_INTEL_tensor_float32_conversion
+def OpRoundFToTF32INTEL : UnOp<"OpRoundFToTF32INTEL", 6426>;
+
// 3.42.12 Composite Instructions
def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx),
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index ad976e5..0cd9d78 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1564,6 +1564,13 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::BFloat16ConversionINTEL);
}
break;
+ case SPIRV::OpRoundFToTF32INTEL:
+ if (ST.canUseExtension(
+ SPIRV::Extension::SPV_INTEL_tensor_float32_conversion)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_tensor_float32_conversion);
+ Reqs.addCapability(SPIRV::Capability::TensorFloat32RoundingINTEL);
+ }
+ break;
case SPIRV::OpVariableLengthArrayINTEL:
case SPIRV::OpSaveMemoryINTEL:
case SPIRV::OpRestoreMemoryINTEL:
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 548e9b7..614e83a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -320,6 +320,7 @@ defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>;
defm SPV_INTEL_2d_block_io : ExtensionOperand<122>;
defm SPV_INTEL_int4 : ExtensionOperand<123>;
defm SPV_KHR_float_controls2 : ExtensionOperand<124>;
+defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -529,6 +530,7 @@ defm Subgroup2DBlockTransformINTEL : CapabilityOperand<6229, 0, 0, [SPV_INTEL_2d
defm Subgroup2DBlockTransposeINTEL : CapabilityOperand<6230, 0, 0, [SPV_INTEL_2d_block_io], [Subgroup2DBlockIOINTEL]>;
defm Int4TypeINTEL : CapabilityOperand<5112, 0, 0, [SPV_INTEL_int4], []>;
defm Int4CooperativeMatrixINTEL : CapabilityOperand<5114, 0, 0, [SPV_INTEL_int4], [Int4TypeINTEL, CooperativeMatrixKHR]>;
+defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tensor_float32_conversion], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index ba023af..bc60842 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -127,8 +127,7 @@ public:
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override {
@@ -253,21 +252,19 @@ MCFixupKindInfo SparcAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void SparcAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
if (!IsResolved)
return;
Value = adjustFixupValue(Fixup.getKind(), Value);
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- unsigned Offset = Fixup.getOffset();
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i;
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index d5f8492..d692cbe 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -113,8 +113,7 @@ public:
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
};
@@ -152,20 +151,18 @@ MCFixupKindInfo SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (Target.getSpecifier())
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
MCFixupKind Kind = Fixup.getKind();
if (mc::isRelocation(Kind))
return;
- unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
unsigned Size = (BitSize + 7) / 8;
- assert(Offset + Size <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!");
// Big-endian insertion of Size bytes.
Value = extractBitsForFixup(Kind, Value, Fixup, getContext());
@@ -173,7 +170,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value &= ((uint64_t)1 << BitSize) - 1;
unsigned ShiftValue = (Size * 8) - 8;
for (unsigned I = 0; I != Size; ++I) {
- Data[Offset + I] |= uint8_t(Value >> ShiftValue);
+ Data[I] |= uint8_t(Value >> ShiftValue);
ShiftValue -= 8;
}
}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
index f987621..c1b9d9f 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -112,8 +112,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char>, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override {
@@ -152,7 +151,7 @@ public:
} // end anonymous namespace
void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *Data,
uint64_t Value, bool IsResolved) {
switch (Fixup.getKind()) {
case VE::fixup_ve_tls_gd_hi32:
@@ -173,14 +172,14 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits
// from the fixup value. The Value has been "split up" into the
// appropriate bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i;
- Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 837fd8e..eecef31 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -39,7 +39,7 @@ public:
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value, bool) override;
+ uint8_t *Data, uint64_t Value, bool) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -80,8 +80,7 @@ bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
void WebAssemblyAsmBackend::applyFixup(const MCFragment &F,
const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *Data,
uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
@@ -96,13 +95,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned I = 0; I != NumBytes; ++I)
- Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
std::unique_ptr<MCObjectTargetWriter>
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 7f9d474..1f02e56 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -174,8 +174,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override;
@@ -676,9 +675,8 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &,
}
void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// Force relocation when there is a specifier. This might be too conservative
// - GAS doesn't emit a relocation for call local@plt; local:.
if (Target.getSpecifier())
@@ -690,7 +688,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
return;
unsigned Size = getFixupKindSize(Kind);
- assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!");
int64_t SignedValue = static_cast<int64_t>(Value);
if (IsResolved && Fixup.isPCRel()) {
@@ -710,7 +708,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
for (unsigned i = 0; i != Size; ++i)
- Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+ Data[i] = uint8_t(Value >> (i * 8));
}
bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bbbb1d9..ce4c061 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23486,7 +23486,6 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
}
// Try to shrink i64 compares if the input has enough zero bits.
- // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
@@ -23496,6 +23495,16 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
+ // Try to shrink all i64 compares if the inputs are representable as signed
+ // i32.
+ if (CmpVT == MVT::i64 &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
+
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
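
Why more than 32 sign bits is enough for the new fold can be checked with plain arithmetic; a tiny standalone example (not LLVM code), with fitsInSignedI32 as an illustrative name:

#include <cassert>
#include <cstdint>

// A 64-bit value whose top 33 bits all equal the sign bit survives a signed
// i32 truncate/extend round trip, so truncating both operands preserves the
// compare result for signed and unsigned predicates alike.
static bool fitsInSignedI32(int64_t V) {
  return V == static_cast<int64_t>(static_cast<int32_t>(V));
}

int main() {
  assert(fitsInSignedI32(-5) && fitsInSignedI32(INT32_MAX));
  assert(!fitsInSignedI32(int64_t(1) << 40));
  return 0;
}
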
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
index 9167794..08936ad 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
@@ -37,8 +37,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
@@ -153,9 +152,8 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F,
}
void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
MCContext &Ctx = getContext();
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
@@ -168,11 +166,10 @@ void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (!Value)
return; // Doesn't change encoding.
- unsigned Offset = Fixup.getOffset();
unsigned FullSize = getSize(Fixup.getKind());
for (unsigned i = 0; i != FullSize; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 126be71..e5d2e1c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -444,6 +444,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["bitop3-insts"] = true;
Features["prng-inst"] = true;
Features["tanh-insts"] = true;
+ Features["tensor-cvt-lut-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
Features["bf16-cvt-insts"] = true;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 3c24d2e..01da012 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -13404,7 +13404,7 @@ struct AAAllocationInfoImpl : public AAAllocationInfo {
return indicatePessimisticFixpoint();
if (BinSize == 0) {
- auto NewAllocationSize = std::optional<TypeSize>(TypeSize(0, false));
+ auto NewAllocationSize = std::make_optional<TypeSize>(0, false);
if (!changeAllocationSize(NewAllocationSize))
return ChangeStatus::UNCHANGED;
return ChangeStatus::CHANGED;
@@ -13422,8 +13422,7 @@ struct AAAllocationInfoImpl : public AAAllocationInfo {
if (SizeOfBin >= *AllocationSize)
return indicatePessimisticFixpoint();
- auto NewAllocationSize =
- std::optional<TypeSize>(TypeSize(SizeOfBin * 8, false));
+ auto NewAllocationSize = std::make_optional<TypeSize>(SizeOfBin * 8, false);
if (!changeAllocationSize(NewAllocationSize))
return ChangeStatus::UNCHANGED;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index c009c1e..b8c99f1 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -99,6 +99,9 @@ STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
STATISTIC(MismatchedCloneAssignments,
"Number of callsites assigned to call multiple non-matching clones");
+STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
+STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
+STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -109,6 +112,11 @@ static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
cl::Hidden,
cl::desc("Export graph to dot files."));
+// TODO: Remove this option once new handling is validated more widely.
+static cl::opt<bool> DoMergeIteration(
+ "memprof-merge-iteration", cl::init(true), cl::Hidden,
+ cl::desc("Iteratively apply merging on a node to catch new callers"));
+
// How much of the graph to export to dot.
enum DotScope {
All, // The full CCG graph.
@@ -3995,7 +4003,7 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
+ auto *CurF = getCalleeFunc(CallerCall.call());
auto NewCalleeCloneNo = CalleeFunc.cloneNo();
if (isMemProfClone(*CurF)) {
// If we already assigned this callsite to call a specific non-default
@@ -4191,16 +4199,36 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
if (!Inserted.second)
return;
- // Make a copy since the recursive call may move a caller edge to a new
- // callee, messing up the iterator.
- auto CallerEdges = Node->CallerEdges;
- for (auto CallerEdge : CallerEdges) {
- // Skip any caller edge moved onto a different callee during recursion.
- if (CallerEdge->Callee != Node)
- continue;
- mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
+ // Iteratively perform merging on this node to handle new caller nodes created
+ // during the recursive traversal. We could do something more elegant such as
+ // maintaining a worklist, but this is a simple approach that doesn't cause a
+ // measurable compile time effect, as most nodes don't have many caller
+ // edges to check.
+ bool FoundUnvisited = true;
+ unsigned Iters = 0;
+ while (FoundUnvisited) {
+ Iters++;
+ FoundUnvisited = false;
+ // Make a copy since the recursive call may move a caller edge to a new
+ // callee, messing up the iterator.
+ auto CallerEdges = Node->CallerEdges;
+ for (auto CallerEdge : CallerEdges) {
+ // Skip any caller edge moved onto a different callee during recursion.
+ if (CallerEdge->Callee != Node)
+ continue;
+ // If we found an unvisited caller, note that we should check the caller
+ // edges again as mergeClones may add or change caller nodes.
+ if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
+ FoundUnvisited = true;
+ mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
+ }
}
+ TotalMergeInvokes++;
+ TotalMergeIters += Iters;
+ if (Iters > MaxMergeIters)
+ MaxMergeIters = Iters;
+
// Merge for this node after we handle its callers.
mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
}
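
The iteration added above is a plain fixed-point loop; a minimal sketch of the pattern with hypothetical Node/process names (not the MemProf types):

#include <set>
#include <vector>

// Keep re-scanning a node's callers as long as the previous pass saw an
// unvisited one, because processing a caller can itself add or re-parent
// caller edges on this node.
struct Node { std::vector<Node *> Callers; };

static void process(Node *N, std::set<Node *> &Visited) {
  if (!Visited.insert(N).second)
    return;
  bool FoundUnvisited = true;
  while (FoundUnvisited) {
    FoundUnvisited = false;
    // Copy: the recursive call may modify N->Callers.
    std::vector<Node *> Callers = N->Callers;
    for (Node *Caller : Callers) {
      if (!Visited.count(Caller))
        FoundUnvisited = true;
      process(Caller, Visited);
    }
  }
}
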
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 00b877b..fe0f308 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -462,6 +462,13 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
return ScalarPHI;
}
+ // If SrcVec is a subvector starting at index 0, extract from the
+ // wider source vector.
+ Value *V;
+ if (match(SrcVec,
+ m_Intrinsic<Intrinsic::vector_extract>(m_Value(V), m_Zero())))
+ return ExtractElementInst::Create(V, Index);
+
// TODO come up with a n-ary matcher that subsumes both unary and
// binary matchers.
UnaryOperator *UO;
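
The equivalence behind the new fold is that a prefix subvector shares its leading lanes with the wider source; a small standalone illustration (not LLVM code), using std::array in place of IR vectors:

#include <algorithm>
#include <array>
#include <cassert>

int main() {
  std::array<int, 8> V = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<int, 4> Sub;                 // llvm.vector.extract(V, 0) analogue
  std::copy_n(V.begin(), 4, Sub.begin());
  unsigned Index = 2;
  assert(Sub[Index] == V[Index]);         // extractelement folds to the wider V
  return 0;
}
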
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 56358b1..5ee3bb1 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2011,12 +2011,17 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
if (IdenticalUsers) {
- for (User *U : make_early_inc_range(PN->users())) {
+ // Collect and deduplicate users up-front to avoid iterator invalidation.
+ SmallSetVector<Instruction *, 4> ToReplace;
+ for (User *U : PN->users()) {
Instruction *User = cast<Instruction>(U);
if (User == &I)
continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
+ ToReplace.insert(User);
+ }
+ for (Instruction *I : ToReplace) {
+ replaceInstUsesWith(*I, NewPN);
+ eraseInstFromFunction(*I);
}
OneUse = true;
}
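
The rewrite above follows the usual snapshot-then-mutate discipline; a minimal sketch of the idea with hypothetical types (std::set standing in for SmallSetVector):

#include <set>
#include <vector>

// Collect the users to rewrite first, deduplicated, then mutate in a second
// pass so erasing a user cannot invalidate the iteration over the live list.
struct Inst { std::vector<Inst *> Users; };

static void rewriteUsers(Inst &Def, Inst *Skip, std::vector<Inst *> &Erased) {
  std::set<Inst *> Seen;
  std::vector<Inst *> Worklist;           // insertion-ordered snapshot
  for (Inst *U : Def.Users) {
    if (U == Skip)
      continue;
    if (Seen.insert(U).second)
      Worklist.push_back(U);
  }
  for (Inst *U : Worklist)
    Erased.push_back(U);                  // stand-in for replace + erase
}
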
@@ -2654,9 +2659,18 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP,
APInt NewOffset = TypeSize * *C2 + *C1;
if (NewOffset.isZero() ||
(Src->hasOneUse() && GEP.getOperand(1)->hasOneUse())) {
+ GEPNoWrapFlags Flags = GEPNoWrapFlags::none();
+ if (GEP.hasNoUnsignedWrap() &&
+ cast<GEPOperator>(Src)->hasNoUnsignedWrap() &&
+ match(GEP.getOperand(1), m_NUWAddLike(m_Value(), m_Value()))) {
+ Flags |= GEPNoWrapFlags::noUnsignedWrap();
+ if (GEP.isInBounds() && cast<GEPOperator>(Src)->isInBounds())
+ Flags |= GEPNoWrapFlags::inBounds();
+ }
+
Value *GEPConst =
- IC.Builder.CreatePtrAdd(Base, IC.Builder.getInt(NewOffset));
- return GetElementPtrInst::Create(BaseType, GEPConst, VarIndex);
+ IC.Builder.CreatePtrAdd(Base, IC.Builder.getInt(NewOffset), "", Flags);
+ return GetElementPtrInst::Create(BaseType, GEPConst, VarIndex, Flags);
}
return nullptr;
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 68094c3..c3f80f9 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2508,6 +2508,12 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo,
if (!GEP)
return false;
+ // Do not try to hoist a constant GEP out of the loop via reassociation.
+ // Constant GEPs can often be folded into addressing modes, and reassociating
+ // them may inhibit CSE of a common base.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
auto *Src = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
if (!Src || !Src->hasOneUse() || !L.contains(Src))
return false;
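
The rationale for skipping all-constant GEPs is that they are just a fixed byte displacement, which typical addressing modes absorb for free; a small illustration (plain C++, not LLVM code):

#include <cstddef>
#include <cstdio>

struct Elt { int A; int B[32]; };

int main() {
  // A GEP with all-constant indices reduces to base + <constant offset>,
  // e.g. [reg + 16] on x86, so hoisting it out of the loop buys nothing and
  // may only obscure a common base for CSE.
  size_t ByteOffset = offsetof(Elt, B) + 3 * sizeof(int);
  std::printf("constant GEP offset = %zu bytes\n", ByteOffset);
  return 0;
}
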
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index f3e992c..04039b8 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1009,7 +1009,8 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
- LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
+ LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr,
+ &AR.AC);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 4f2bfb0..448dc2b 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
const Function *F = L.getHeader()->getParent();
OptimizationRemarkEmitter ORE(F);
- LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
+ LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC);
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b792..6ffe841 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -79,8 +79,7 @@
// ld.global.f32 %f4, [%rl6+132]; // much better
//
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
-// multiple indices to either multiple GEPs with a single index or arithmetic
-// operations (depending on whether the target uses alias analysis in codegen).
+// multiple indices to multiple GEPs with a single index.
// Such transformation can have following benefits:
// (1) It can always extract constants in the indices of structure type.
// (2) After such Lowering, there are more optimization opportunities such as
@@ -88,59 +87,33 @@
//
// E.g. The following GEPs have multiple indices:
// BB1:
-// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// %p = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 3
// load %p
// ...
// BB2:
-// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+// %p2 = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 2
// load %p2
// ...
//
// We can not do CSE to the common part related to index "i64 %i". Lowering
// GEPs can achieve such goals.
-// If the target does not use alias analysis in codegen, this pass will
-// lower a GEP with multiple indices into arithmetic operations:
-// BB1:
-// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = add i64 %1, %2 ; CSE opportunity
-// %4 = mul i64 %j1, length_of_struct
-// %5 = add i64 %3, %4
-// %6 = add i64 %3, struct_field_3 ; Constant offset
-// %p = inttoptr i64 %6 to i32*
-// load %p
-// ...
-// BB2:
-// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = add i64 %7, %8 ; CSE opportunity
-// %10 = mul i64 %j2, length_of_struct
-// %11 = add i64 %9, %10
-// %12 = add i64 %11, struct_field_2 ; Constant offset
-// %p = inttoptr i64 %12 to i32*
-// load %p2
-// ...
//
-// If the target uses alias analysis in codegen, this pass will lower a GEP
-// with multiple indices into multiple GEPs with a single index:
+// This pass will lower a GEP with multiple indices into multiple GEPs with a
+// single index:
// BB1:
-// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8, ptr %ptr, i64 %2 ; CSE opportunity
// %4 = mul i64 %j1, length_of_struct
-// %5 = getelementptr i8* %3, i64 %4
-// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
-// %p = bitcast i8* %6 to i32*
+// %5 = getelementptr i8, ptr %3, i64 %4
+// %p = getelementptr i8, ptr %5, struct_field_3 ; Constant offset
// load %p
// ...
// BB2:
-// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8, ptr %ptr, i64 %8 ; CSE opportunity
// %10 = mul i64 %j2, length_of_struct
-// %11 = getelementptr i8* %9, i64 %10
-// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
-// %p2 = bitcast i8* %12 to i32*
+// %11 = getelementptr i8, ptr %9, i64 %10
+// %p2 = getelementptr i8, ptr %11, struct_field_2 ; Constant offset
// load %p2
// ...
//
@@ -408,16 +381,6 @@ private:
void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
int64_t AccumulativeByteOffset);
- /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
- /// Function splitGEP already split the original GEP into a variadic part and
- /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
- /// variadic part into a set of arithmetic operations and applies
- /// AccumulativeByteOffset to it.
- /// \p Variadic The variadic part of the original GEP.
- /// \p AccumulativeByteOffset The constant offset.
- void lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset);
-
/// Finds the constant offset within each index and accumulates them. If
/// LowerGEP is true, it finds in indices of both sequential and structure
/// types, otherwise it only finds in sequential indices. The output
@@ -951,55 +914,6 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
Variadic->eraseFromParent();
}
-void
-SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset) {
- IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
- assert(IntPtrTy == DL->getIndexType(Variadic->getType()) &&
- "Pointer type must match index type for arithmetic-based lowering of "
- "split GEPs");
-
- Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
- gep_type_iterator GTI = gep_type_begin(*Variadic);
- // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
- // don't create arithmetics for structure indices, as they are accumulated
- // in the constant offset index.
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- Value *Idx = Variadic->getOperand(I);
- // Skip zero indices.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
- if (CI->isZero())
- continue;
-
- APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- GTI.getSequentialElementStride(*DL));
- // Scale the index by element size.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- Idx = Builder.CreateShl(
- Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
- } else {
- Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
- }
- }
- // Create an ADD for each index.
- ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
- }
- }
-
- // Create an ADD for the constant offset index.
- if (AccumulativeByteOffset != 0) {
- ResultPtr = Builder.CreateAdd(
- ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
- }
-
- ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
- Variadic->replaceAllUsesWith(ResultPtr);
- Variadic->eraseFromParent();
-}
-
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
TargetTransformInfo &TTI) {
auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
@@ -1091,8 +1005,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Notice that we don't remove struct field indices here. If LowerGEP is
// disabled, a structure index is not accumulated and we still use the old
// one. If LowerGEP is enabled, a structure index is accumulated in the
- // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
- // handle the constant offset and won't need a new structure index.
+ // constant offset. lowerToSingleIndexGEPs will later handle the constant
+ // offset and won't need a new structure index.
gep_type_iterator GTI = gep_type_begin(*GEP);
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
if (GTI.isSequential()) {
@@ -1167,22 +1081,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
GEP->setNoWrapFlags(NewGEPFlags);
- // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ // Lowers a GEP to GEPs with a single index.
if (LowerGEP) {
- // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
- // arithmetic operations if the target uses alias analysis in codegen.
- // Additionally, pointers that aren't integral (and so can't be safely
- // converted to integers) or those whose offset size is different from their
- // pointer size (which means that doing integer arithmetic on them could
- // affect that data) can't be lowered in this way.
- unsigned AddrSpace = GEP->getPointerAddressSpace();
- bool PointerHasExtraData = DL->getPointerSizeInBits(AddrSpace) !=
- DL->getIndexSizeInBits(AddrSpace);
- if (TTI.useAA() || DL->isNonIntegralAddressSpace(AddrSpace) ||
- PointerHasExtraData)
- lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
- else
- lowerToArithmetics(GEP, AccumulativeByteOffset);
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 969d225..c47fd942 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1665,13 +1665,12 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// Keep a record of all the exiting blocks.
SmallVector<const SCEVPredicate *, 4> Predicates;
- std::optional<std::pair<BasicBlock *, BasicBlock *>> SingleUncountableEdge;
+ BasicBlock *SingleUncountableExitingBlock = nullptr;
for (BasicBlock *BB : ExitingBlocks) {
const SCEV *EC =
PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
if (isa<SCEVCouldNotCompute>(EC)) {
- SmallVector<BasicBlock *, 2> Succs(successors(BB));
- if (Succs.size() != 2) {
+ if (size(successors(BB)) != 2) {
reportVectorizationFailure(
"Early exiting block does not have exactly two successors",
"Incorrect number of successors from early exiting block",
@@ -1679,15 +1678,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
return false;
}
- BasicBlock *ExitBlock;
- if (!TheLoop->contains(Succs[0]))
- ExitBlock = Succs[0];
- else {
- assert(!TheLoop->contains(Succs[1]));
- ExitBlock = Succs[1];
- }
-
- if (SingleUncountableEdge) {
+ if (SingleUncountableExitingBlock) {
reportVectorizationFailure(
"Loop has too many uncountable exits",
"Cannot vectorize early exit loop with more than one early exit",
@@ -1695,7 +1686,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
return false;
}
- SingleUncountableEdge = {BB, ExitBlock};
+ SingleUncountableExitingBlock = BB;
} else
CountableExitingBlocks.push_back(BB);
}
@@ -1705,7 +1696,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// PSE.getSymbolicMaxBackedgeTakenCount() below.
Predicates.clear();
- if (!SingleUncountableEdge) {
+ if (!SingleUncountableExitingBlock) {
LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits");
return false;
}
@@ -1713,7 +1704,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// The only supported early exit loops so far are ones where the early
// exiting block is a unique predecessor of the latch block.
BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
- if (LatchPredBB != SingleUncountableEdge->first) {
+ if (LatchPredBB != SingleUncountableExitingBlock) {
reportVectorizationFailure("Early exit is not the latch predecessor",
"Cannot vectorize early exit loop",
"EarlyExitNotLatchPredecessor", ORE, TheLoop);
@@ -1766,7 +1757,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
// The vectoriser cannot handle loads that occur after the early exit block.
- assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
+ assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
"Expected latch predecessor to be the early exiting block");
// TODO: Handle loops that may fault.
@@ -1789,7 +1780,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
"backedge taken count: "
<< *SymbolicMaxBTC << '\n');
- UncountableEdge = SingleUncountableEdge;
+ UncountableExitingBB = SingleUncountableExitingBlock;
return true;
}
@@ -1861,7 +1852,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
} else {
if (!isVectorizableEarlyExitLoop()) {
- UncountableEdge = std::nullopt;
+ assert(!hasUncountableEarlyExit() &&
+ "Must be false without vectorizable early-exit loop");
if (DoExtraAnalysis)
Result = false;
else
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 850c4a1..b4ea70e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1624,7 +1624,7 @@ private:
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
- DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
+ MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
@@ -9788,6 +9788,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
match(
P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
m_SpecificInt(0)) &&
+ any_of(P.incoming_values(),
+ [&EPI](Value *Inc) {
+ return Inc == EPI.VectorTripCount;
+ }) &&
all_of(P.incoming_values(), [&EPI](Value *Inc) {
return Inc == EPI.VectorTripCount ||
match(Inc, m_SpecificInt(0));
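
The extra any_of matters because all_of alone is also satisfied when every incoming value is zero; a tiny standalone check of that difference (plain C++, not the VPlan code):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Incoming = {0, 0};
  int VectorTripCount = 16;
  auto IsTCOrZero = [&](int V) { return V == VectorTripCount || V == 0; };
  bool AllOk = std::all_of(Incoming.begin(), Incoming.end(), IsTCOrZero);
  bool HasTC = std::any_of(Incoming.begin(), Incoming.end(),
                           [&](int V) { return V == VectorTripCount; });
  assert(AllOk && !HasTC); // all_of passes, the combined check correctly fails
  return 0;
}
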
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index b77aa9d..c79485c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -231,6 +231,13 @@ vp_post_order_shallow(VPBlockBase *G) {
}
/// Returns an iterator range to traverse the graph starting at \p G in
+/// post order while traversing through region blocks.
+inline iterator_range<po_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>>
+vp_post_order_deep(VPBlockBase *G) {
+ return post_order(VPBlockDeepTraversalWrapper<VPBlockBase *>(G));
+}
+
+/// Returns an iterator range to traverse the graph starting at \p G in
/// depth-first order while traversing through region blocks.
inline iterator_range<df_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>>
vp_depth_first_deep(VPBlockBase *G) {
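
For orientation, post order emits a block only after all of its successors, so walking the resulting range tends to visit users before the blocks that define their operands; a generic sketch (not the VPlan iterators):

#include <set>
#include <vector>

struct Block { std::vector<Block *> Succs; };

// Classic recursive post-order DFS: a block is appended only after every
// successor has been emitted.
static void postOrder(Block *B, std::set<Block *> &Seen,
                      std::vector<Block *> &Out) {
  if (!Seen.insert(B).second)
    return;
  for (Block *S : B->Succs)
    postOrder(S, Seen, Out);
  Out.push_back(B);
}
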
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d249a34..98d11f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2836,12 +2836,12 @@ static void scalarizeInstruction(const Instruction *Instr,
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy) {
Cloned->setName(Instr->getName() + ".cloned");
-#if !defined(NDEBUG)
- // Verify that VPlan type inference results agree with the type of the
- // generated values.
- assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
- "inferred type and type from generated instructions do not match");
-#endif
+ Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
+ // The operands of the replicate recipe may have been narrowed, resulting in
+ // a narrower result type. Update the type of the cloned instruction to the
+ // correct type.
+ if (ResultTy != Cloned->getType())
+ Cloned->mutateType(ResultTy);
}
RepRecipe->applyFlags(*Cloned);
@@ -3548,6 +3548,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
+ assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ "Mismatch between NeedsMaskForGaps and MaskForGaps");
assert((!MaskForGaps || !State.VF.isScalable()) &&
"masking gaps for scalable vectors is not yet supported.");
ArrayRef<VPValue *> StoredValues = getStoredValues();
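
A gap mask for a non-full interleave group has one bit per lane of the wide access, repeated per iteration; a rough sketch of that layout with illustrative names (the real helper is createBitMaskForGaps):

#include <vector>

// For each of the VF iterations, emit Factor bits, one per group member slot;
// slots with no member stay false so the wide access skips those lanes.
static std::vector<bool> gapMask(unsigned VF, unsigned Factor,
                                 const std::vector<bool> &HasMember) {
  std::vector<bool> Mask;
  for (unsigned I = 0; I != VF; ++I)
    for (unsigned J = 0; J != Factor; ++J)
      Mask.push_back(HasMember[J]);
  return Mask;
}
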
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 18d331b..373b2f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -545,10 +545,8 @@ static bool isDeadRecipe(VPRecipeBase &R) {
}
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getEntry());
-
- for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_post_order_deep(Plan.getEntry()))) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
@@ -1413,7 +1411,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
ElementCount BestVF, unsigned BestUF,
ScalarEvolution &SE) {
- if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
+ if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
&SE](VPValue *C) {
return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
@@ -1431,15 +1429,15 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
// count is not conveniently available as SCEV so far, so we compare directly
// against the original trip count. This is stricter than necessary, as we
// will only return true if the trip count == vector trip count.
- // TODO: Use SCEV for vector trip count once available, to cover cases where
- // vector trip count == UF * VF, but original trip count != UF * VF.
- const SCEV *TripCount =
- vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TripCount) &&
+ const SCEV *VectorTripCount =
+ vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
+ if (isa<SCEVCouldNotCompute>(VectorTripCount))
+ VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
"Trip count SCEV must be computable");
ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
- const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
- return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
+ const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
+ return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
}
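
The predicate being proven is just that the vector loop runs exactly once; in plain numbers (illustrative, no SCEV involved):

#include <cassert>

int main() {
  unsigned VF = 4, UF = 2;
  unsigned VectorTripCount = 8;        // e.g. a known trip count of 8
  // The branch can be folded when vector trip count == VF * UF, i.e. the
  // vector loop body executes exactly one time.
  assert(VectorTripCount == VF * UF);
  return 0;
}
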
/// Try to simplify the branch condition of \p Plan. This may restrict the
@@ -2563,7 +2561,8 @@ void VPlanTransforms::createInterleaveGroups(
}
bool NeedsMaskForGaps =
- IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
+ (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
+ (!StoredValues.empty() && !IG->isFull());
Instruction *IRInsertPos = IG->getInsertPos();
auto *InsertPos =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21b..14f20c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -73,8 +73,11 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
}
const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
- if (V->isLiveIn())
- return SE.getSCEV(V->getLiveInIRValue());
+ if (V->isLiveIn()) {
+ if (Value *LiveIn = V->getLiveInIRValue())
+ return SE.getSCEV(LiveIn);
+ return SE.getCouldNotCompute();
+ }
// TODO: Support constructing SCEVs for more recipes as needed.
return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())