path: root/llvm/lib/Target
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 131
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrGISel.td | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 38
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 140
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 29
-rw-r--r--  llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 7
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp | 66
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp | 7
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 31
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrFuture.td | 3
-rw-r--r--  llvm/lib/Target/RISCV/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp | 213
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/Sparc/Sparc.td | 12
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/Target.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 20
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp | 2
-rw-r--r--  llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp | 21
32 files changed, 699 insertions, 130 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d08f9b9..2987468 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -96,6 +96,7 @@
#include <cctype>
#include <cstdint>
#include <cstdlib>
+#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -17989,11 +17990,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
unsigned Factor,
const APInt &GapMask) const {
- assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
- "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
+
+ if (isProfitableToInterleaveWithGatherScatter() &&
+ Factor > getMaxSupportedInterleaveFactor())
+ return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");
@@ -18139,6 +18146,126 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
+/// If the interleave factor is greater than the supported MaxFactor, the data
+/// can still be interleaved by combining it with additional shuffles before
+/// it is stored with stN instructions.
+///
+/// Consider the following example with an interleave factor of 8, where the
+/// data is shuffled and then stored with stN instructions. The data needs to
+/// be stored in this order:
+/// [v0, v1, v2, v3, v4, v5, v6, v7]
+///
+/// v0 v4 v2 v6 v1 v5 v3 v7
+/// | | | | | | | |
+/// \ / \ / \ / \ /
+/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
+/// | | | |
+/// \ / \ /
+/// \ / \ /
+/// \ / \ /
+/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
+///
+/// For stN = 4, the upper half of the interleaved data, v0, v1, v2, v3, is
+/// stored with one st4 instruction. The lower half, i.e., v4, v5, v6, v7, is
+/// stored with another st4.
+///
+/// For stN = 2, the upper half of the interleaved data, v0, v1, is stored
+/// with one st2 instruction. The second set, v2, v3, is stored with another
+/// st2. A total of four st2 instructions are required here.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+ StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+ unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ bool UseScalable;
+
+ // Skip if we do not have NEON and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+ return false;
+
+ if (UseScalable)
+ return false;
+
+ std::deque<Value *> Shuffles;
+ Shuffles.push_back(SVI);
+ unsigned ConcatLevel = Factor;
+ // Getting all the interleaved operands.
+ while (ConcatLevel > 1) {
+ unsigned InterleavedOperands = Shuffles.size();
+ for (unsigned i = 0; i < InterleavedOperands; i++) {
+ ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
+ if (!SFL)
+ return false;
+ Shuffles.pop_front();
+
+ Value *Op0 = SFL->getOperand(0);
+ Value *Op1 = SFL->getOperand(1);
+
+ Shuffles.push_back(dyn_cast<Value>(Op0));
+ Shuffles.push_back(dyn_cast<Value>(Op1));
+ }
+ ConcatLevel >>= 1;
+ }
+
+ IRBuilder<> Builder(SI);
+ auto Mask = createInterleaveMask(LaneLen, 2);
+ SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
+ for (unsigned i = 0; i < LaneLen; i++) {
+ LowerHalfMask[i] = Mask[i];
+ UpperHalfMask[i] = Mask[i + LaneLen];
+ }
+
+ unsigned InterleaveFactor = Factor >> 1;
+ while (InterleaveFactor >= MaxSupportedFactor) {
+ std::deque<Value *> ShufflesIntermediate;
+ ShufflesIntermediate.resize(Factor);
+ for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ auto *Shuffle = Builder.CreateShuffleVector(
+ Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
+ ShufflesIntermediate[i + j] = Shuffle;
+ Shuffle = Builder.CreateShuffleVector(
+ Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
+ ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
+ }
+ }
+ Shuffles = ShufflesIntermediate;
+ InterleaveFactor >>= 1;
+ }
+
+ Type *PtrTy = SI->getPointerOperandType();
+ auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+ Value *BaseAddr = SI->getPointerOperand();
+ Function *StNFunc = getStructuredStoreFunction(
+ SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+ for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+ SmallVector<Value *, 5> Ops;
+ for (unsigned j = 0; j < MaxSupportedFactor; j++)
+ Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
+
+ if (i > 0) {
+ // We will compute the pointer operand of each store from the original
+ // base address using GEPs. Cast the base address to a pointer to the
+ // scalar element type.
+ BaseAddr = Builder.CreateConstGEP1_32(
+ SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+ }
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
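
To make the zip tree described in the lowerInterleavedStoreWithShuffle comment easier to follow, here is a small standalone C++ sketch. It is an illustration only, not part of the patch: the leaf order v0..v7 and the zip_lo/zip_hi labels are assumptions standing in for the shuffles built with LowerHalfMask and UpperHalfMask. It mirrors the mask construction and the pairing loop for Factor = 8 with MaxSupportedFactor = 4 and prints the two st4 groups the lowering would emit.

// Standalone sketch: mirrors the mask construction and the pairwise zip
// rounds for Factor = 8, MaxSupportedFactor = 4, assuming the leaf vectors
// are recovered from the shuffle tree in the order v0..v7.
#include <cstdio>
#include <deque>
#include <string>

int main() {
  const unsigned Factor = 8, MaxSupportedFactor = 4, LaneLen = 4;

  // createInterleaveMask(LaneLen, 2) produces [0, LaneLen, 1, LaneLen+1, ...];
  // its first LaneLen entries form LowerHalfMask, the rest UpperHalfMask.
  std::printf("interleave mask:");
  for (unsigned i = 0; i < LaneLen; ++i)
    std::printf(" %u %u", i, LaneLen + i);
  std::printf("\n");

  std::deque<std::string> Shuffles;
  for (unsigned i = 0; i < Factor; ++i)
    Shuffles.push_back("v" + std::to_string(i));

  // One zip round per halving of the factor, as in the while loop of the
  // patch: element i is paired with element i + InterleaveFactor.
  unsigned InterleaveFactor = Factor >> 1;
  while (InterleaveFactor >= MaxSupportedFactor) {
    std::deque<std::string> Next(Factor);
    for (unsigned j = 0; j < Factor; j += InterleaveFactor * 2)
      for (unsigned i = 0; i < InterleaveFactor; ++i) {
        Next[i + j] = "zip_lo(" + Shuffles[i + j] + "," +
                      Shuffles[i + j + InterleaveFactor] + ")";
        Next[i + j + InterleaveFactor] =
            "zip_hi(" + Shuffles[i + j] + "," +
            Shuffles[i + j + InterleaveFactor] + ")";
      }
    Shuffles = Next;
    InterleaveFactor >>= 1;
  }

  // Each group of MaxSupportedFactor results feeds one stN call; the base
  // address advances by LaneLen * MaxSupportedFactor elements per call.
  for (unsigned i = 0; i < Factor / MaxSupportedFactor; ++i) {
    std::printf("st%u at base + %u elements:", MaxSupportedFactor,
                i * LaneLen * MaxSupportedFactor);
    for (unsigned j = 0; j < MaxSupportedFactor; ++j)
      std::printf(" %s", Shuffles[i * MaxSupportedFactor + j].c_str());
    std::printf("\n");
  }
  return 0;
}

With these assumptions the sketch prints two st4 groups, the first holding the zip_lo results of (v0,v4), (v1,v5), (v2,v6), (v3,v7) and the second the corresponding zip_hi results, which together lay the data out in the original factor-8 interleaved order.
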
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae7..bfd8474 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ public:
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
+ bool isProfitableToInterleaveWithGatherScatter() const override {
+ return true;
+ }
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ public:
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;
+ bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const;
+
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 30b7b03..52b216c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+def G_PMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src1, type1:$src2);
+ let hasSideEffects = 0;
+}
+
def G_UADDLP : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1);
@@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;
def : GINodeEquiv<G_BSP, AArch64bsp>;
+def : GINodeEquiv<G_PMULL, AArch64pmull>;
def : GINodeEquiv<G_UMULL, AArch64umull>;
def : GINodeEquiv<G_SMULL, AArch64smull>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6..8729ed3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,11 +4922,36 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ unsigned NumLoadStores = 1;
+ InstructionCost ShuffleCost = 0;
+ bool isInterleaveWithShuffle = false;
+ unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+ auto *SubVecTy =
+ VectorType::get(VecVTy->getElementType(),
+ VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+ if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+ Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+ Factor > MaxSupportedFactor) {
+ isInterleaveWithShuffle = true;
+ SmallVector<int, 16> Mask;
+ // Prepare the interleave mask.
+ for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+ i++) {
+ for (unsigned j = 0; j < 2; j++)
+ Mask.push_back(j * Factor + i);
+ }
+
+ NumLoadStores = Factor / MaxSupportedFactor;
+ ShuffleCost =
+ (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+ Mask, CostKind, 0, SubVecTy));
+ }
+
+ if (!UseMaskForGaps &&
+ (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
- auto *SubVecTy =
- VectorType::get(VecVTy->getElementType(),
- VecVTy->getElementCount().divideCoefficientBy(Factor));
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -4934,7 +4959,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+ return (Factor *
+ TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+ NumLoadStores) +
+ ShuffleCost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
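
As a hypothetical walk-through of the new cost expression (traced from the code above, not measured numbers): for a store interleaved with Factor = 8 of a <128 x i8> value on a 128-bit NEON subtarget with MaxSupportedFactor = 4, SubVecTy is <16 x i8>, so getNumInterleavedAccesses returns 1 and NumLoadStores = 8 / 4 = 2. The returned cost is then 8 * 1 * 2 = 16 for the stN instructions plus ShuffleCost, which is 8 times the target's SK_Splice cost for one zip-style shuffle built from the interleave mask; previously such a store would have fallen through to the generic BaseT::getInterleavedMemoryOpCost fallback.
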
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5f93847..038ad77 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_FMAXNUM);
case Intrinsic::aarch64_neon_fminnm:
return LowerBinOp(TargetOpcode::G_FMINNUM);
+ case Intrinsic::aarch64_neon_pmull:
+ case Intrinsic::aarch64_neon_pmull64:
+ return LowerBinOp(AArch64::G_PMULL);
case Intrinsic::aarch64_neon_smull:
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 6d2d705..6b920f0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case TargetOpcode::G_FCMP:
case TargetOpcode::G_LROUND:
case TargetOpcode::G_LLROUND:
+ case AArch64::G_PMULL:
return true;
case TargetOpcode::G_INTRINSIC:
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1..4fe194c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,6 +2366,18 @@ def isGFX8GFX9NotGFX90A :
" Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+// Pre-90A GFX9s allow the NV bit in FLAT instructions.
+def isNVAllowedInFlat :
+ Predicate<"!Subtarget->hasGFX90AInsts() &&"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
+
+// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
+def isNVNotAllowedInFlat :
+ Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
+ " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
+ AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
+
def isGFX90AOnly :
Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a59132..fdff21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() {
} else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
- ConditionalTemps.push_back(RsrcInst);
- RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ // Guard against conditionals that were already folded away.
+ if (RsrcInst != *MaybeRsrc) {
+ ConditionalTemps.push_back(RsrcInst);
+ RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+ }
}
for (Value *V : Seen)
FoundRsrcs[V] = *MaybeRsrc;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 09338c5..2808c44 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,6 +1602,11 @@ public:
bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
+ bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
+ }
+
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5370,7 +5375,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
Error(S, "scale_offset is not supported on this GPU");
}
- if (CPol & CPol::NV) {
+ if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7145,6 +7150,13 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
unsigned Enabled = 0, Seen = 0;
for (;;) {
SMLoc S = getLoc();
+
+ if (isGFX9() && trySkipId("nv")) {
+ Enabled |= CPol::NV;
+ Seen |= CPol::NV;
+ continue;
+ }
+
bool Disabling;
unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8ea64d1..6ef2241 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
bits<7> saddr;
bits<10> vdst;
- bits<5> cpol;
+ bits<6> cpol;
// Only valid on gfx9
bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2693,29 +2693,52 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
!subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
}
+class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+ FLAT_Real_vi <op, ps, has_sccb> {
+ let AssemblerPredicate = isNVNotAllowedInFlat;
+}
+
+class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+ FLAT_Real_vi <op, ps, has_sccb> {
+ let AssemblerPredicate = isNVAllowedInFlat;
+ let Subtarget = SIEncodingFamily.GFX9;
+ let DecoderNamespace = "GFX9";
+ let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
+}
+
+multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
+ def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
+ def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
+}
+
multiclass FLAT_Real_AllAddr_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
- def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+ defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+}
+
+multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
let AssemblerPredicate = isGFX940Plus;
- let DecoderNamespace = "GFX9";
+ let DecoderNamespace = "GFX940";
let Inst{13} = ps.sve;
let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
}
multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
- let AssemblerPredicate = isGFX8GFX9NotGFX940;
- let OtherPredicates = [isGFX8GFX9NotGFX940];
- }
- def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
- let DecoderNamespace = "GFX9";
+ let OtherPredicates = [isGFX8GFX9NotGFX940] in {
+ defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
}
+
+ defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+
let AssemblerPredicate = isGFX940Plus in {
def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2728,11 +2751,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
let OtherPredicates = [isGFX8GFX9NotGFX940] in {
- def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
+ defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
}
- def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
+ defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
}
@@ -2748,47 +2771,66 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
}
-def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
-def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
-def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
-def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
-def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
-def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
-def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
-def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
-def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
-def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
-def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
-def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
-def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
-def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+defm FLAT_LOAD_UBYTE_vi : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
+defm FLAT_LOAD_SBYTE_vi : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
+defm FLAT_LOAD_USHORT_vi : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
+defm FLAT_LOAD_SSHORT_vi : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
+defm FLAT_LOAD_DWORD_vi : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
+defm FLAT_LOAD_DWORDX2_vi : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
+defm FLAT_LOAD_DWORDX4_vi : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
+defm FLAT_LOAD_DWORDX3_vi : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+defm FLAT_STORE_BYTE_vi : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
+defm FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+defm FLAT_STORE_SHORT_vi : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
+defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+defm FLAT_STORE_DWORD_vi : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
+defm FLAT_STORE_DWORDX2_vi : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
+defm FLAT_STORE_DWORDX4_vi : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
+defm FLAT_STORE_DWORDX3_vi : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+defm FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+defm FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+defm FLAT_LOAD_SHORT_D16_vi : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
+defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
multiclass FLAT_Real_Atomics_vi <bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
defvar ps = !cast<FLAT_Pseudo>(NAME);
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
- def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+ defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ defvar ps = !cast<FLAT_Pseudo>(NAME);
+ def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+
+ def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
}
multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
FLAT_Real_AllAddr_vi<op, has_sccb> {
- def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
- def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+ defm _RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+
+ def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+ def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
+ FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
+ def _RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
- def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+ def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>;
@@ -2950,10 +2992,10 @@ let AssemblerPredicate = isGFX940Plus in {
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
- defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d>;
- defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e>;
- defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52>;
- defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
} // End AssemblerPredicate = isGFX940Plus
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 703ec0a..3e6f35d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,8 +186,12 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
- if (Imm & ~CPol::ALL_pregfx12)
- O << " /* unexpected cache policy bit */";
+ if (Imm & ~CPol::ALL_pregfx12) {
+ if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
+ O << " nv";
+ else
+ O << " /* unexpected cache policy bit */";
+ }
}
void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6616b30..84984a0 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1129,40 +1129,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return false;
- MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
return true;
}
- // TODO: Verify the following code handles subregisters correctly.
- // TODO: Handle extract of global reference
- if (UseOp.getSubReg())
- return false;
-
- if (!OpToFold.isReg())
- return false;
-
- Register UseReg = OpToFold.getReg();
- if (!UseReg.isVirtual())
- return false;
-
- // Maybe it is just a COPY of an immediate itself.
-
- // FIXME: Remove this handling. There is already special case folding of
- // immediate into copy in foldOperand. This is looking for the def of the
- // value the folding started from in the first place.
- MachineInstr *Def = MRI->getVRegDef(UseReg);
- if (Def && TII->isFoldableCopy(*Def)) {
- MachineOperand &DefOp = Def->getOperand(1);
- if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
- OpToFold.DefSubReg);
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
- return true;
- }
- }
-
return false;
}
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index eb4c884..677203d 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -285,6 +285,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX;
PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY;
PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ;
+ if (MMI.EntryPropertyVec[0].WaveSizeMin) {
+ PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin;
+ PSV.BaseData.MaximumWaveLaneCount =
+ MMI.EntryPropertyVec[0].WaveSizeMax
+ ? MMI.EntryPropertyVec[0].WaveSizeMax
+ : MMI.EntryPropertyVec[0].WaveSizeMin;
+ }
break;
default:
break;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index cf8b833..e1a472f 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -82,6 +82,7 @@ enum class EntryPropsTag {
ASStateTag,
WaveSize,
EntryRootSig,
+ WaveRange = 23,
};
} // namespace
@@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) {
case EntryPropsTag::ASStateTag:
case EntryPropsTag::WaveSize:
case EntryPropsTag::EntryRootSig:
+ case EntryPropsTag::WaveRange:
llvm_unreachable("NYI: Unhandled entry property tag");
}
return MDVals;
}
-static MDTuple *
-getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
- const Triple::EnvironmentType ShaderProfile) {
+static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP,
+ uint64_t EntryShaderFlags,
+ const ModuleMetadataInfo &MMDI) {
SmallVector<Metadata *> MDVals;
LLVMContext &Ctx = EP.Entry->getContext();
if (EntryShaderFlags != 0)
@@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
// FIXME: support more props.
// See https://github.com/llvm/llvm-project/issues/57948.
// Add shader kind for lib entries.
- if (ShaderProfile == Triple::EnvironmentType::Library &&
+ if (MMDI.ShaderProfile == Triple::EnvironmentType::Library &&
EP.ShaderStage != Triple::EnvironmentType::Library)
MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind,
getShaderStage(EP.ShaderStage), Ctx));
if (EP.ShaderStage == Triple::EnvironmentType::Compute) {
+ // Handle mandatory "hlsl.numthreads"
MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get(
Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads))));
Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get(
@@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
ConstantAsMetadata::get(ConstantInt::get(
Type::getInt32Ty(Ctx), EP.NumThreadsZ))};
MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals));
+
+ // Handle optional "hlsl.wavesize". The fields are optionally represented
+ // if they are non-zero.
+ if (EP.WaveSizeMin != 0) {
+ bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion;
+ bool IsWaveSize =
+ !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion;
+
+ if (!IsWaveRange && !IsWaveSize) {
+ reportError(M, "Shader model 6.6 or greater is required to specify "
+ "the \"hlsl.wavesize\" function attribute");
+ return nullptr;
+ }
+
+ // A range is being specified if EP.WaveSizeMax != 0
+ if (EP.WaveSizeMax && !IsWaveRange) {
+ reportError(
+ M, "Shader model 6.8 or greater is required to specify "
+ "wave size range values of the \"hlsl.wavesize\" function "
+ "attribute");
+ return nullptr;
+ }
+
+ EntryPropsTag Tag =
+ IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange;
+ MDVals.emplace_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag))));
+
+ SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))};
+ if (IsWaveRange) {
+ WaveSizeVals.push_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax)));
+ WaveSizeVals.push_back(ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref)));
+ }
+
+ MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals));
+ }
}
}
+
if (MDVals.empty())
return nullptr;
return MDNode::get(Ctx, MDVals);
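
As a worked example of the metadata this produces (traced from the code above, not compiler output): a compute entry compiled for shader model 6.8 or later with a wave size range of min 16, max 64, preferred 32 gets the WaveRange tag (23) followed by the tuple !{i32 16, i32 64, i32 32} appended to its entry properties, whereas a 6.6 or 6.7 entry that only specifies a minimum of 32 gets the WaveSize tag with the single-element tuple !{i32 32}. Using the attribute before 6.6, or a range before 6.8, is reported as an error instead.
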
@@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn,
return MDNode::get(Ctx, MDVals);
}
-static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures,
- MDNode *MDResources,
+static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP,
+ MDTuple *Signatures, MDNode *MDResources,
const uint64_t EntryShaderFlags,
- const Triple::EnvironmentType ShaderProfile) {
- MDTuple *Properties =
- getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile);
+ const ModuleMetadataInfo &MMDI) {
+ MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI);
return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties,
EP.Entry->getContext());
}
@@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
"'"));
}
-
- EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
- EntryShaderFlags,
- MMDI.ShaderProfile));
+ EntryFnMDNodes.emplace_back(emitEntryMD(
+ M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI));
}
NamedMDNode *EntryPointsNamedMD =
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
index 479ac90..f29a739 100644
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -104,13 +104,6 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
{Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
} // namespace
-namespace llvm {
-
-FunctionPass *createHexagonQFPOptimizer();
-void initializeHexagonQFPOptimizerPass(PassRegistry &);
-
-} // namespace llvm
-
namespace {
struct HexagonQFPOptimizer : public MachineFunctionPass {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index fe700e1..cf4ffc82 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0),
N->getOperand(1));
break;
+ case Intrinsic::loongarch_lasx_concat_128_s:
+ case Intrinsic::loongarch_lasx_concat_128_d:
+ case Intrinsic::loongarch_lasx_concat_128:
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
}
return SDValue();
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index b502b056..00d5287 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>;
defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>;
defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>;
+// LASX and LSX conversion
+def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)),
+ (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)),
+ (EXTRACT_SUBREG LASX256:$src, sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)),
+ (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
+def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)),
+ (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>;
} // Predicates = [HasExtLASX]
/// Intrinsic pattern
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e..dfbbba0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -420,6 +420,9 @@ let Predicates = [HasVSX, IsISAFuture] in {
: VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
"vucmprlh $VRT, $VRA, $VRB", []>;
+ def XVRLW: XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvrlw $XT, $XA, $XB", []>;
+
// AES Acceleration Instructions
def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp),
(ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M),
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 0ff178e..e9088a4 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen
RISCVMoveMerger.cpp
RISCVOptWInstrs.cpp
RISCVPostRAExpandPseudoInsts.cpp
+ RISCVPromoteConstant.cpp
RISCVPushPopOptimizer.cpp
RISCVRedundantCopyElimination.cpp
RISCVRegisterInfo.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index ae94101..51e8e85 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -20,6 +20,7 @@
namespace llvm {
class FunctionPass;
class InstructionSelector;
+class ModulePass;
class PassRegistry;
class RISCVRegisterBankInfo;
class RISCVSubtarget;
@@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &);
FunctionPass *createRISCVPreLegalizerCombiner();
void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
+ModulePass *createRISCVPromoteConstantPass();
+void initializeRISCVPromoteConstantPass(PassRegistry &);
+
FunctionPass *createRISCVVLOptimizerPass();
void initializeRISCVVLOptimizerPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp
new file mode 100644
index 0000000..bf1f69f
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp
@@ -0,0 +1,213 @@
+//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-promote-const"
+#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants"
+
+STATISTIC(NumPromoted, "Number of constant literals promoted to globals");
+STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants");
+
+namespace {
+
+class RISCVPromoteConstant : public ModulePass {
+public:
+ static char ID;
+ RISCVPromoteConstant() : ModulePass(ID) {}
+
+ StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+
+ /// Iterate over the functions and promote the double fp constants that
+ /// would otherwise go into the constant pool to a constant array.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ // TargetMachine and Subtarget are needed to query isFPImmlegal.
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ bool Changed = false;
+ for (Function &F : M) {
+ const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F);
+ const RISCVTargetLowering *TLI = ST.getTargetLowering();
+ Changed |= runOnFunction(F, TLI);
+ }
+ return Changed;
+ }
+
+private:
+ bool runOnFunction(Function &F, const RISCVTargetLowering *TLI);
+};
+} // end anonymous namespace
+
+char RISCVPromoteConstant::ID = 0;
+
+INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME,
+ false, false)
+
+ModulePass *llvm::createRISCVPromoteConstantPass() {
+ return new RISCVPromoteConstant();
+}
+
+bool RISCVPromoteConstant::runOnFunction(Function &F,
+ const RISCVTargetLowering *TLI) {
+ if (F.hasOptNone() || F.hasOptSize())
+ return false;
+
+ // Bail out and make no transformation if the target doesn't support
+ // doubles, or if we're not targeting RV64 as we currently see some
+ // regressions for those targets.
+ if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64))
+ return false;
+
+ // Collect all unique double constants and their uses in the function. Use
+ // MapVector to preserve insertion order.
+ MapVector<ConstantFP *, SmallVector<Use *, 8>> ConstUsesMap;
+
+ for (Instruction &I : instructions(F)) {
+ for (Use &U : I.operands()) {
+ auto *C = dyn_cast<ConstantFP>(U.get());
+ if (!C || !C->getType()->isDoubleTy())
+ continue;
+ // Do not promote if it wouldn't be loaded from the constant pool.
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64,
+ /*ForCodeSize=*/false))
+ continue;
+ // Do not promote a constant if it is used as an immediate argument
+ // for an intrinsic.
+ if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) {
+ Function *IntrinsicFunc = II->getCalledFunction();
+ unsigned OperandIdx = U.getOperandNo();
+ if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr(
+ OperandIdx, Attribute::ImmArg)) {
+ LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II
+ << " because operand " << OperandIdx
+ << " must be an immediate.\n");
+ continue;
+ }
+ }
+ // Note: FP args to inline asm would be problematic if we had a
+ // constraint that required an immediate floating point operand. At the
+ // time of writing LLVM doesn't recognise such a constraint.
+ ConstUsesMap[C].push_back(&U);
+ }
+ }
+
+ int PromotableConstants = ConstUsesMap.size();
+ LLVM_DEBUG(dbgs() << "Found " << PromotableConstants
+ << " promotable constants in " << F.getName() << "\n");
+ // Bail out if no promotable constants found, or if only one is found.
+ if (PromotableConstants < 2) {
+ LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable "
+ "constants found\n");
+ return false;
+ }
+
+ NumPromoted += PromotableConstants;
+
+ // Create a global array containing the promoted constants.
+ Module *M = F.getParent();
+ Type *DoubleTy = Type::getDoubleTy(M->getContext());
+
+ SmallVector<Constant *, 16> ConstantVector;
+ for (auto const &Pair : ConstUsesMap)
+ ConstantVector.push_back(Pair.first);
+
+ ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size());
+ Constant *GlobalArrayInitializer =
+ ConstantArray::get(ArrayTy, ConstantVector);
+
+ auto *GlobalArray = new GlobalVariable(
+ *M, ArrayTy,
+ /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer,
+ ".promoted_doubles." + F.getName());
+
+ // A cache to hold the loaded value for a given constant within a basic block.
+ DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads;
+
+ // Replace all uses with the loaded value.
+ unsigned Idx = 0;
+ for (auto const &Pair : ConstUsesMap) {
+ ConstantFP *Const = Pair.first;
+ const SmallVector<Use *, 8> &Uses = Pair.second;
+
+ for (Use *U : Uses) {
+ Instruction *UserInst = cast<Instruction>(U->getUser());
+ BasicBlock *InsertionBB;
+
+ // If the user is a PHI node, we must insert the load in the
+ // corresponding predecessor basic block. Otherwise, it's inserted into
+ // the same block as the use.
+ if (auto *PN = dyn_cast<PHINode>(UserInst))
+ InsertionBB = PN->getIncomingBlock(*U);
+ else
+ InsertionBB = UserInst->getParent();
+
+ if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) {
+ LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means thre is no valid "
+ "insertion point.\n");
+ return false;
+ }
+
+ auto CacheKey = std::make_pair(Const, InsertionBB);
+ Value *LoadedVal = nullptr;
+
+ // Re-use a load if it exists in the insertion block.
+ if (LocalLoads.count(CacheKey)) {
+ LoadedVal = LocalLoads.at(CacheKey);
+ } else {
+ // Otherwise, create a new GEP and Load at the correct insertion point.
+ // It is always safe to insert at the first insertion point in the BB,
+ // so do that and let other passes reorder.
+ IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt());
+ Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64(
+ GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr");
+ LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val");
+
+ // Cache the newly created load for this block.
+ LocalLoads[CacheKey] = LoadedVal;
+ }
+
+ U->set(LoadedVal);
+ ++NumPromotedUses;
+ }
+ ++Idx;
+ }
+
+ return true;
+}
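
To illustrate the kind of input the pass rewrites, here is a hypothetical example; the function name, the literal values, and the claim that these literals are not legal FP immediates for the subtarget are assumptions for illustration.

// Hypothetical input with two distinct double literals. Assuming neither
// value is a legal FP immediate on the RV64 subtarget and the function is not
// optnone/optsize, the pass collects both constants into one internal global
// array named ".promoted_doubles.<function name>" and rewrites every use as a
// GEP into that array plus a load, caching at most one load per constant per
// basic block, instead of emitting a separate constant-pool access for each
// literal.
double scale(double x, bool hi) {
  return hi ? x * 6.731 : x * 0.4;
}
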
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index ae54ff1..16ef67d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeRISCVExpandAtomicPseudoPass(*PR);
initializeRISCVRedundantCopyEliminationPass(*PR);
initializeRISCVAsmPrinterPass(*PR);
+ initializeRISCVPromoteConstantPass(*PR);
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() {
}
bool RISCVPassConfig::addPreISel() {
+ if (TM->getOptLevel() != CodeGenOptLevel::None)
+ addPass(createRISCVPromoteConstantPass());
if (TM->getOptLevel() != CodeGenOptLevel::None) {
// Add a barrier before instruction selection so that we will not get
// deleted block address after enabling default outlining. See D99707 for
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 7137e5f..38b0508 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -95,6 +95,9 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true",
"rd %pc, %XX is slow", [FeatureV9]>;
+def TuneNoPredictor : SubtargetFeature<"no-predictor", "HasNoPredictor", "true",
+ "Processor has no branch predictor, branches stall execution", []>;
+
//==== Features added predominantly for LEON subtarget support
include "LeonFeatures.td"
@@ -174,12 +177,15 @@ def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
FeatureVIS2],
[TuneSlowRDPC]>;
def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
- FeatureVIS2, FeatureUA2005]>;
+ FeatureVIS2, FeatureUA2005],
+ [TuneNoPredictor]>;
def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
- FeatureVIS, FeatureVIS2, FeatureUA2005]>;
+ FeatureVIS, FeatureVIS2, FeatureUA2005],
+ [TuneNoPredictor]>;
def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureVIS, FeatureVIS2, FeatureVIS3,
- FeatureUA2005, FeatureUA2007]>;
+ FeatureUA2005, FeatureUA2007],
+ [TuneNoPredictor]>;
def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureVIS, FeatureVIS2, FeatureVIS3,
FeatureUA2005, FeatureUA2007, FeatureOSA2011,
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index cbb7db6..ae3c326 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2000,6 +2000,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ // Some processors have no branch predictor and have pipelines longer than
+ // what can be covered by the delay slot. This results in a stall, so mark
+ // branches to be expensive on those processors.
+ setJumpIsExpensive(Subtarget->hasNoPredictor());
+ // The high cost of branching means that using conditional moves will
+ // still be profitable even if the condition is predictable.
+ PredictableSelectIsExpensive = !isJumpExpensive();
+
setMinFunctionAlignment(Align(4));
computeRegisterProperties(Subtarget->getRegisterInfo());
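
As a rough illustration of the intended effect (not a guaranteed codegen outcome): with JumpIsExpensive set and PredictableSelectIsExpensive cleared, a simple conditional expression like the hypothetical one below is more likely to stay as a select and be lowered to a V9 conditional move than to be expanded into a compare-and-branch.

// Hypothetical source: on a core with no branch predictor, keeping this as a
// conditional move avoids the pipeline stall that a branch reaching beyond
// the delay slot would cause.
int clamp_to_zero(int x) { return x < 0 ? 0 : x; }
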
diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp
index ec673ef..7387571 100644
--- a/llvm/lib/Target/Target.cpp
+++ b/llvm/lib/Target/Target.cpp
@@ -37,6 +37,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) {
void llvm::initializeTarget(PassRegistry &Registry) {
initializeTargetLibraryInfoWrapperPassPass(Registry);
+ initializeRuntimeLibraryInfoWrapperPass(Registry);
initializeTargetTransformInfoWrapperPassPass(Registry);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index fc82e5b..304c4f3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -41,6 +41,11 @@ defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref),
"ref.test\t$type, $ref", "ref.test $type", 0xfb14>,
Requires<[HasGC]>;
+defm REF_FUNC : I<(outs FUNCREF:$res), (ins function32_op:$func),
+ (outs), (ins function32_op:$func), [],
+ "ref.func\t$func", "ref.func $func", 0xd2>,
+ Requires<[HasReferenceTypes]>;
+
defm "" : REF_I<FUNCREF, funcref, "func">;
defm "" : REF_I<EXTERNREF, externref, "extern">;
defm "" : REF_I<EXNREF, exnref, "exn">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 45b0e7d..f3c236c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -532,13 +532,19 @@ struct StaticLibcallNameMap {
// FIXME: This is broken if there are ever different triples compiled with
// different libcalls.
RTLIB::RuntimeLibcallsInfo RTCI(TT);
- for (RTLIB::Libcall LC : RTLIB::libcalls()) {
- StringRef NameLibcall = RTCI.getLibcallName(LC);
- if (!NameLibcall.empty() &&
- getRuntimeLibcallSignatures().Table[LC] != unsupported) {
- assert(!Map.contains(NameLibcall) &&
- "duplicate libcall names in name map");
- Map[NameLibcall] = LC;
+
+ ArrayRef<RuntimeLibcallSignature> Table =
+ getRuntimeLibcallSignatures().Table;
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (!RTCI.isAvailable(Impl))
+ continue;
+ RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+ if (Table[LC] != unsupported) {
+ StringRef NameLibcall =
+ RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl);
+ // FIXME: Map should be to LibcallImpl
+ if (!Map.insert({NameLibcall, LC}).second)
+ llvm_unreachable("duplicate libcall names in name map");
}
}
}
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d4418c8..6c16fcfb 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4728,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
- SDValue InnerOp = Op->getOperand(0);
+ SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
- if (!getFoldableLogicOp(InnerOp))
+ if (!InnerOp)
return SDValue();
N0 = InnerOp.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 06b8f7614..4d44227b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53370,8 +53370,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
//
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
SDValue SrcVal, InsertBit, ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Value(SrcVal),
+ if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
@@ -53442,8 +53441,18 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
}
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
+ SDValue NewStore =
+ DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+ Align(), St->getMemOperand()->getFlags());
+
+ // If there are other uses of StoredVal, replace with a new load of the
+ // whole (updated) value.
+ if (!StoredVal.hasOneUse()) {
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+ }
+ return NewStore;
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4eb..5977a27 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
case Xtensa::SSIP:
case Xtensa::LSI:
case Xtensa::LSIP:
-
+ case Xtensa::S32C1I:
if (Res & 0x3) {
report_fatal_error("Unexpected operand value!");
}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e73070..8d0fd07 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
return FeatureBits[Xtensa::FeatureWindowed];
case Xtensa::ATOMCTL:
case Xtensa::SCOMPARE1:
- return FeatureBits[Xtensa::FeatureWindowed];
+ return FeatureBits[Xtensa::FeatureS32C1I];
case Xtensa::NoRegister:
return false;
}
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f..be69cef 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -114,14 +114,31 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, Register DestReg,
Register SrcReg, bool KillSrc,
bool RenamableDest, bool RenamableSrc) const {
- // The MOV instruction is not present in core ISA,
+ unsigned Opcode;
+
+ // The MOV instruction is not present in core ISA for AR registers,
// so use OR instruction.
- if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+ if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+ Xtensa::FPRRegClass.contains(DestReg))
+ Opcode = Xtensa::MOV_S;
+ else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+ Xtensa::ARRegClass.contains(DestReg))
+ Opcode = Xtensa::RFR;
+ else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+ Xtensa::FPRRegClass.contains(DestReg))
+ Opcode = Xtensa::WFR;
else
report_fatal_error("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void XtensaInstrInfo::storeRegToStackSlot(