Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 172
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 38
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 140
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 149
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.h | 6
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 9
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 81
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 64
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 6
22 files changed, 421 insertions, 346 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2987468..40e6400 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,6 +50,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -96,7 +97,6 @@
#include <cctype>
#include <cstdint>
#include <cstdlib>
-#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -105,7 +105,6 @@
#include <vector>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
@@ -1175,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
+ setTargetDAGCombine(ISD::CTPOP);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -11331,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
break;
}
+ // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
+ // prefer using SVE if available.
if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
+ useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
switch (Opcode) {
default:
llvm_unreachable("Wrong instruction");
@@ -17555,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
// udot instruction.
if (SrcWidth * 4 <= DstWidth) {
if (all_of(I->users(), [&](auto *U) {
+ using namespace llvm::PatternMatch;
auto *SingleUser = cast<Instruction>(&*U);
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
return true;
@@ -17826,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// into shift / and masks. For the moment we do this just for uitofp (not
// zext) to avoid issues with widening instructions.
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
+ using namespace llvm::PatternMatch;
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
SI->getType()->getScalarSizeInBits() * 4 ==
SI->user_back()->getType()->getScalarSizeInBits();
@@ -17990,17 +17993,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
unsigned Factor,
const APInt &GapMask) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
-
- if (isProfitableToInterleaveWithGatherScatter() &&
- Factor > getMaxSupportedInterleaveFactor())
- return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
-
- assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
- "Invalid interleave factor");
-
assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");
@@ -18146,126 +18143,6 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
-/// If the interleaved vector elements are greater than supported MaxFactor,
-/// interleaving the data with additional shuffles can be used to
-/// achieve the same.
-///
-/// Consider the following data with 8 interleaves which are shuffled to store
-/// stN instructions. Data needs to be stored in this order:
-/// [v0, v1, v2, v3, v4, v5, v6, v7]
-///
-/// v0 v4 v2 v6 v1 v5 v3 v7
-/// | | | | | | | |
-/// \ / \ / \ / \ /
-/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
-/// | | | |
-/// \ / \ /
-/// \ / \ /
-/// \ / \ /
-/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
-///
-/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
-/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
-/// another st4.
-///
-/// For stN = 2, upper half of interleaved data V0, V1 is stored
-/// with one st2 instruction. Second set V2, V3 is stored with another st2.
-/// Total of 4 st2's are required here.
-bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
- StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
- unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
-
- auto *VecTy = cast<FixedVectorType>(SVI->getType());
- assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
-
- unsigned LaneLen = VecTy->getNumElements() / Factor;
- Type *EltTy = VecTy->getElementType();
- auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
-
- const DataLayout &DL = SI->getModule()->getDataLayout();
- bool UseScalable;
-
- // Skip if we do not have NEON and skip illegal vector types. We can
- // "legalize" wide vector types into multiple interleaved accesses as long as
- // the vector types are divisible by 128.
- if (!Subtarget->hasNEON() ||
- !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return false;
-
- if (UseScalable)
- return false;
-
- std::deque<Value *> Shuffles;
- Shuffles.push_back(SVI);
- unsigned ConcatLevel = Factor;
- // Getting all the interleaved operands.
- while (ConcatLevel > 1) {
- unsigned InterleavedOperands = Shuffles.size();
- for (unsigned i = 0; i < InterleavedOperands; i++) {
- ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
- if (!SFL)
- return false;
- Shuffles.pop_front();
-
- Value *Op0 = SFL->getOperand(0);
- Value *Op1 = SFL->getOperand(1);
-
- Shuffles.push_back(dyn_cast<Value>(Op0));
- Shuffles.push_back(dyn_cast<Value>(Op1));
- }
- ConcatLevel >>= 1;
- }
-
- IRBuilder<> Builder(SI);
- auto Mask = createInterleaveMask(LaneLen, 2);
- SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
- for (unsigned i = 0; i < LaneLen; i++) {
- LowerHalfMask[i] = Mask[i];
- UpperHalfMask[i] = Mask[i + LaneLen];
- }
-
- unsigned InterleaveFactor = Factor >> 1;
- while (InterleaveFactor >= MaxSupportedFactor) {
- std::deque<Value *> ShufflesIntermediate;
- ShufflesIntermediate.resize(Factor);
- for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- auto *Shuffle = Builder.CreateShuffleVector(
- Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
- ShufflesIntermediate[i + j] = Shuffle;
- Shuffle = Builder.CreateShuffleVector(
- Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
- ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
- }
- }
- Shuffles = ShufflesIntermediate;
- InterleaveFactor >>= 1;
- }
-
- Type *PtrTy = SI->getPointerOperandType();
- auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
-
- Value *BaseAddr = SI->getPointerOperand();
- Function *StNFunc = getStructuredStoreFunction(
- SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
- for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
- SmallVector<Value *, 5> Ops;
- for (unsigned j = 0; j < MaxSupportedFactor; j++)
- Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
-
- if (i > 0) {
- // We will compute the pointer operand of each store from the original
- // base address using GEPs. Cast the base address to a pointer to the
- // scalar element type.
- BaseAddr = Builder.CreateConstGEP1_32(
- SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
- }
- Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
- Builder.CreateCall(StNFunc, Ops);
- }
- return true;
-}
-
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
@@ -27968,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
{A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
}
+static SDValue performCTPOPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ using namespace llvm::SDPatternMatch;
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+ SDValue Mask;
+ if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ EVT MaskVT = Mask.getValueType();
+
+ if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
+ MaskVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ EVT ReduceInVT =
+ EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+
+ SDLoc DL(N);
+ // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
+ SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
+ return DAG.getNegative(NegPopCount, DL, VT);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -28313,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performScalarToVectorCombine(N, DCI, DAG);
case ISD::SHL:
return performSHLCombine(N, DCI, DAG);
+ case ISD::CTPOP:
+ return performCTPOPCombine(N, DCI, DAG);
}
return SDValue();
}
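
The new performCTPOPCombine rests on the identity that, for a fixed-length i1 mask, the population count of its scalar bitcast equals the number of true lanes, and summing the sign-extended lanes (each 0 or -1) yields the negated count. Below is a minimal standalone C++ sketch of that identity, independent of SelectionDAG and using a hypothetical 8-lane mask; the function name is illustrative, not part of the patch.

#include <array>
#include <cstdint>

// popcount(bitcast(<8 x i1> M)) == number of true lanes
//                               == -(signed sum of the sign-extended lanes).
static int64_t popcountViaSignedReduce(const std::array<bool, 8> &Mask) {
  int64_t Sum = 0;
  for (bool Lane : Mask)
    Sum += Lane ? -1 : 0; // sext of an i1 lane is 0 or -1
  return -Sum;            // negate the reduction to recover the popcount
}

On the DAG this becomes a SIGN_EXTEND of the mask followed by VECREDUCE_ADD and a negate, which typically maps to a vector reduction (ADDV-style) instead of materializing the mask as a scalar integer first.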
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfd8474..70bfae7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,10 +229,6 @@ public:
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
- bool isProfitableToInterleaveWithGatherScatter() const override {
- return true;
- }
-
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -243,9 +239,6 @@ public:
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;
- bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
- unsigned Factor) const;
-
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8729ed3..197aae6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,36 +4922,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- unsigned NumLoadStores = 1;
- InstructionCost ShuffleCost = 0;
- bool isInterleaveWithShuffle = false;
- unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
-
- auto *SubVecTy =
- VectorType::get(VecVTy->getElementType(),
- VecVTy->getElementCount().divideCoefficientBy(Factor));
-
- if (TLI->isProfitableToInterleaveWithGatherScatter() &&
- Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
- Factor > MaxSupportedFactor) {
- isInterleaveWithShuffle = true;
- SmallVector<int, 16> Mask;
- // preparing interleave Mask.
- for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
- i++) {
- for (unsigned j = 0; j < 2; j++)
- Mask.push_back(j * Factor + i);
- }
-
- NumLoadStores = Factor / MaxSupportedFactor;
- ShuffleCost =
- (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
- Mask, CostKind, 0, SubVecTy));
- }
-
- if (!UseMaskForGaps &&
- (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
+ if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
+ auto *SubVecTy =
+ VectorType::get(VecVTy->getElementType(),
+ VecVTy->getElementCount().divideCoefficientBy(Factor));
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -4959,10 +4934,7 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return (Factor *
- TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
- NumLoadStores) +
- ShuffleCost;
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e3b0a1b..e62fdb6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -312,7 +312,7 @@ public:
}
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
- if (!ST->hasSVE())
+ if (!ST->isSVEorStreamingSVEAvailable())
return false;
// For fixed vectors, avoid scalarization if using SVE for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4fe194c..54d94b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,18 +2366,6 @@ def isGFX8GFX9NotGFX90A :
" Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
-// Pre-90A GFX9s allow the NV bit in FLAT instructions.
-def isNVAllowedInFlat :
- Predicate<"!Subtarget->hasGFX90AInsts() &&"
- " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
- AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
-
-// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
-def isNVNotAllowedInFlat :
- Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
- " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
- AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
-
def isGFX90AOnly :
Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2808c44..09338c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,11 +1602,6 @@ public:
bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
- bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
- uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
- return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
- }
-
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5375,7 +5370,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
Error(S, "scale_offset is not supported on this GPU");
}
- if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
+ if (CPol & CPol::NV) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7150,13 +7145,6 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
unsigned Enabled = 0, Seen = 0;
for (;;) {
SMLoc S = getLoc();
-
- if (isGFX9() && trySkipId("nv")) {
- Enabled |= CPol::NV;
- Seen |= CPol::NV;
- continue;
- }
-
bool Disabling;
unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6ef2241..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
bits<7> saddr;
bits<10> vdst;
- bits<6> cpol;
+ bits<5> cpol;
// Only valid on gfx9
bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2693,52 +2693,29 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
!subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
}
-class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
- FLAT_Real_vi <op, ps, has_sccb> {
- let AssemblerPredicate = isNVNotAllowedInFlat;
-}
-
-class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
- FLAT_Real_vi <op, ps, has_sccb> {
- let AssemblerPredicate = isNVAllowedInFlat;
- let Subtarget = SIEncodingFamily.GFX9;
- let DecoderNamespace = "GFX9";
- let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
-}
-
-multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
- def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
- def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
-}
-
multiclass FLAT_Real_AllAddr_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
- defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
-}
-
-multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
- def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
let AssemblerPredicate = isGFX940Plus;
- let DecoderNamespace = "GFX940";
+ let DecoderNamespace = "GFX9";
let Inst{13} = ps.sve;
let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
}
multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
- let OtherPredicates = [isGFX8GFX9NotGFX940] in {
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX940;
+ let OtherPredicates = [isGFX8GFX9NotGFX940];
+ }
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
+ let DecoderNamespace = "GFX9";
}
-
- defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
-
let AssemblerPredicate = isGFX940Plus in {
def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2751,11 +2728,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
let OtherPredicates = [isGFX8GFX9NotGFX940] in {
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
- defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
}
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
- defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
}
}
@@ -2771,66 +2748,47 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
}
-defm FLAT_LOAD_UBYTE_vi : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
-defm FLAT_LOAD_SBYTE_vi : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
-defm FLAT_LOAD_USHORT_vi : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
-defm FLAT_LOAD_SSHORT_vi : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
-defm FLAT_LOAD_DWORD_vi : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
-defm FLAT_LOAD_DWORDX2_vi : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
-defm FLAT_LOAD_DWORDX4_vi : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
-defm FLAT_LOAD_DWORDX3_vi : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-defm FLAT_STORE_BYTE_vi : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
-defm FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-defm FLAT_STORE_SHORT_vi : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
-defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-defm FLAT_STORE_DWORD_vi : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
-defm FLAT_STORE_DWORDX2_vi : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
-defm FLAT_STORE_DWORDX4_vi : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
-defm FLAT_STORE_DWORDX3_vi : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-defm FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-defm FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-defm FLAT_LOAD_SHORT_D16_vi : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
-defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
+def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
multiclass FLAT_Real_Atomics_vi <bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
defvar ps = !cast<FLAT_Pseudo>(NAME);
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
- defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- defvar ps = !cast<FLAT_Pseudo>(NAME);
- def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
- def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
}
multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
FLAT_Real_AllAddr_vi<op, has_sccb> {
- defm _RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
- defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
-
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
- def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
- FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
- def _RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
- def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
- def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+ def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>;
@@ -2992,10 +2950,10 @@ let AssemblerPredicate = isGFX940Plus in {
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
- defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
- defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
- defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
- defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
} // End AssemblerPredicate = isGFX940Plus
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 3e6f35d..703ec0a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,12 +186,8 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
- if (Imm & ~CPol::ALL_pregfx12) {
- if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
- O << " nv";
- else
- O << " /* unexpected cache policy bit */";
- }
+ if (Imm & ~CPol::ALL_pregfx12)
+ O << " /* unexpected cache policy bit */";
}
void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index c89212d..90a4723 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
+bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+ Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
+ enum MemIOffsetType {
+ Imm14Shift2,
+ Imm12,
+ Imm11Shift1,
+ Imm10Shift2,
+ Imm9Shift3,
+ Imm8,
+ Imm8Shift1,
+ Imm8Shift2,
+ Imm8Shift3
+ };
+
+ MemIOffsetType OT;
+ switch (MemI.getOpcode()) {
+ default:
+ return false;
+ case LoongArch::LDPTR_W:
+ case LoongArch::LDPTR_D:
+ case LoongArch::STPTR_W:
+ case LoongArch::STPTR_D:
+ OT = Imm14Shift2;
+ break;
+ case LoongArch::LD_B:
+ case LoongArch::LD_H:
+ case LoongArch::LD_W:
+ case LoongArch::LD_D:
+ case LoongArch::LD_BU:
+ case LoongArch::LD_HU:
+ case LoongArch::LD_WU:
+ case LoongArch::ST_B:
+ case LoongArch::ST_H:
+ case LoongArch::ST_W:
+ case LoongArch::ST_D:
+ case LoongArch::FLD_S:
+ case LoongArch::FLD_D:
+ case LoongArch::FST_S:
+ case LoongArch::FST_D:
+ case LoongArch::VLD:
+ case LoongArch::VST:
+ case LoongArch::XVLD:
+ case LoongArch::XVST:
+ case LoongArch::VLDREPL_B:
+ case LoongArch::XVLDREPL_B:
+ OT = Imm12;
+ break;
+ case LoongArch::VLDREPL_H:
+ case LoongArch::XVLDREPL_H:
+ OT = Imm11Shift1;
+ break;
+ case LoongArch::VLDREPL_W:
+ case LoongArch::XVLDREPL_W:
+ OT = Imm10Shift2;
+ break;
+ case LoongArch::VLDREPL_D:
+ case LoongArch::XVLDREPL_D:
+ OT = Imm9Shift3;
+ break;
+ case LoongArch::VSTELM_B:
+ case LoongArch::XVSTELM_B:
+ OT = Imm8;
+ break;
+ case LoongArch::VSTELM_H:
+ case LoongArch::XVSTELM_H:
+ OT = Imm8Shift1;
+ break;
+ case LoongArch::VSTELM_W:
+ case LoongArch::XVSTELM_W:
+ OT = Imm8Shift2;
+ break;
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_D:
+ OT = Imm8Shift3;
+ break;
+ }
+
+ if (MemI.getOperand(0).getReg() == Reg)
+ return false;
+
+ if ((AddrI.getOpcode() != LoongArch::ADDI_W &&
+ AddrI.getOpcode() != LoongArch::ADDI_D) ||
+ !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm())
+ return false;
+
+ int64_t OldOffset = MemI.getOperand(2).getImm();
+ int64_t Disp = AddrI.getOperand(2).getImm();
+ int64_t NewOffset = OldOffset + Disp;
+ if (!STI.is64Bit())
+ NewOffset = SignExtend64<32>(NewOffset);
+
+ if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) &&
+ !(OT == Imm12 && isInt<12>(NewOffset)) &&
+ !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) &&
+ !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) &&
+ !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) &&
+ !(OT == Imm8 && isInt<8>(NewOffset)) &&
+ !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) &&
+ !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) &&
+ !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset)))
+ return false;
+
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = 0;
+ AM.Scale = 0;
+ AM.Displacement = NewOffset;
+ AM.Form = ExtAddrMode::Formula::Basic;
+ return true;
+}
+
+MachineInstr *
+LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+ const DebugLoc &DL = MemI.getDebugLoc();
+ MachineBasicBlock &MBB = *MemI.getParent();
+
+ assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+ "Addressing mode not supported for folding");
+
+ unsigned MemIOp = MemI.getOpcode();
+ switch (MemIOp) {
+ default:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ case LoongArch::VSTELM_B:
+ case LoongArch::VSTELM_H:
+ case LoongArch::VSTELM_W:
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_B:
+ case LoongArch::XVSTELM_H:
+ case LoongArch::XVSTELM_W:
+ case LoongArch::XVSTELM_D:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(), 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .addImm(MemI.getOperand(3).getImm())
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ }
+}
+
// Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
bool LoongArch::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() &&
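
The new canFoldIntoAddrMode/emitLdStWithAddr hooks let the sink-and-fold machinery absorb an ADDI displacement into a following memory access when the summed offset still fits that instruction's immediate form. A minimal sketch of just the offset check follows, in plain C++ rather than on MachineInstr, covering only the two simplest forms (simm12 for LD/ST, 4-byte-scaled simm14 for LDPTR/STPTR); helper names are illustrative, and the real hook additionally gates the LDPTR form on STI.hasUAL() and handles the vector load/store-element encodings.

#include <cstdint>

static bool isInt12(int64_t V) { return V >= -2048 && V < 2048; }

static bool isShiftedInt14x4(int64_t V) {
  // Must be a multiple of 4 whose scaled value fits a signed 14-bit field.
  return (V & 3) == 0 && (V >> 2) >= -8192 && (V >> 2) < 8192;
}

// Combined displacement check: MemOff comes from the memory instruction,
// AddDisp from the ADDI feeding its base register.
static bool canFoldOffset(int64_t MemOff, int64_t AddDisp, bool IsLdPtrForm) {
  int64_t NewOff = MemOff + AddDisp;
  return IsLdPtrForm ? isShiftedInt14x4(NewOff) : isInt12(NewOff);
}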
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index f25958a..f69a558 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -93,6 +93,12 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const override;
+ MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const override;
+
protected:
const LoongArchSubtarget &STI;
};
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 9de4c9d..92a9388 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -62,6 +62,11 @@ static cl::opt<bool>
cl::desc("Enable the merge base offset pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableSinkFold("loongarch-enable-sink-fold",
+ cl::desc("Enable sinking and folding of instruction copies"),
+ cl::init(true), cl::Hidden);
+
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
return RM.value_or(Reloc::Static);
}
@@ -146,7 +151,9 @@ namespace {
class LoongArchPassConfig : public TargetPassConfig {
public:
LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ setEnableSinkAndFold(EnableSinkFold);
+ }
LoongArchTargetMachine &getLoongArchTargetMachine() const {
return getTM<LoongArchTargetMachine>();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c3f100e..995ae75 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16496,32 +16496,42 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
}
static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
- unsigned ShY) {
+ unsigned ShY, bool AddX) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue X = N->getOperand(0);
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getTargetConstant(ShY, DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getTargetConstant(ShX, DL, VT), Mul359);
+ DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359);
}
static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt) {
+ // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
switch (MulAmt) {
case 5 * 3:
- return getShlAddShlAdd(N, DAG, 2, 1);
+ return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false);
case 9 * 3:
- return getShlAddShlAdd(N, DAG, 3, 1);
+ return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false);
case 5 * 5:
- return getShlAddShlAdd(N, DAG, 2, 2);
+ return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false);
case 9 * 5:
- return getShlAddShlAdd(N, DAG, 3, 2);
+ return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false);
case 9 * 9:
- return getShlAddShlAdd(N, DAG, 3, 3);
+ return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false);
default:
- return SDValue();
+ break;
}
+
+ // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
+ int ShX;
+ if (int ShY = isShifted359(MulAmt - 1, ShX)) {
+ assert(ShX != 0 && "MulAmt=4,6,10 handled before");
+ if (ShX <= 3)
+ return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true);
+ }
+ return SDValue();
}
// Try to expand a scalar multiply to a faster sequence.
@@ -16581,41 +16591,30 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(Shift, DL, VT));
}
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
- if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt))
- return V;
+ // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
+ // of 25 which happen to be quite common.
+ // (2/4/8 * 3/5/9 + 1) * 2^N
+ Shift = llvm::countr_zero(MulAmt);
+ if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
+ if (Shift == 0)
+ return V;
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
+ }
// If this is a power 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check if this a sum of two power of 2s because that's
// easy. Then count how many zeros are up to the first bit.
- if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
- unsigned ScaleShift = llvm::countr_zero(MulAmt);
- if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getTargetConstant(ScaleShift, DL, VT), Shift1);
- }
+ if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1)));
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getTargetConstant(Shift, DL, VT), Shift1);
}
- // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
- // This is the two instruction form, there are also three instruction
- // variants we could implement. e.g.
- // (2^(1,2,3) * 3,5,9 + 1) << C2
- // 2^(C1>3) * 3,5,9 +/- 1
- if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
- assert(Shift != 0 && "MulAmt=4,6,10 handled before");
- if (Shift <= 3) {
- SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getTargetConstant(ShXAmount, DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getTargetConstant(Shift, DL, VT), X);
- }
- }
+ // TODO: 2^(C1>3) * 3,5,9 +/- 1
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
@@ -16647,14 +16646,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
}
}
-
- // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
- // of 25 which happen to be quite common.
- Shift = llvm::countr_zero(MulAmt);
- if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
- SDLoc DL(N);
- return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
- }
}
if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt))
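
The reordered expandMul path now strips trailing zeros first and hands the odd part to expandMulToShlAddShlAdd, which also gained the (2/4/8 * 3/5/9 + 1) form. A small self-contained C++ sketch of the resulting sequences, using plain integers in place of SHL_ADD nodes (shadd(a, sh, b) standing for (a << sh) + b); the constants 200 = 25 << 3 and 11 = 2 * 5 + 1 are merely illustrative.

#include <cassert>
#include <cstdint>

static uint64_t shadd(uint64_t A, unsigned Sh, uint64_t B) { return (A << Sh) + B; }

static uint64_t mulBy200(uint64_t X) {
  uint64_t M5 = shadd(X, 2, X);    // sh2add x, x   -> 5*x
  uint64_t M25 = shadd(M5, 2, M5); // sh2add 5x, 5x -> 25*x
  return M25 << 3;                 // slli by countr_zero(200) -> 200*x
}

static uint64_t mulBy11(uint64_t X) {
  uint64_t M5 = shadd(X, 2, X); // shYadd x, x          -> 5*x  (ShY = 2)
  return shadd(M5, 1, X);       // shXadd 5x, x (AddX)  -> 11*x (ShX = 1)
}

int main() {
  assert(mulBy200(7) == 1400 && mulBy11(7) == 77);
  return 0;
}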
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 636e31c..bf9de0a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (!TII->isAddImmediate(*DeadMI, Reg))
continue;
LIS->RemoveMachineInstrFromMaps(*DeadMI);
+ Register AddReg = DeadMI->getOperand(1).getReg();
DeadMI->eraseFromParent();
+ if (AddReg.isVirtual())
+ LIS->shrinkToUses(&LIS->getInterval(AddReg));
}
}
}
@@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
// Loop over the dead AVL values, and delete them now. This has
// to be outside the above loop to avoid invalidating iterators.
for (auto *MI : ToDelete) {
+ assert(MI->getOpcode() == RISCV::ADDI);
+ Register AddReg = MI->getOperand(1).getReg();
if (LIS) {
LIS->removeInterval(MI->getOperand(0).getReg());
LIS->RemoveMachineInstrFromMaps(*MI);
}
MI->eraseFromParent();
+ if (LIS && AddReg.isVirtual())
+ LIS->shrinkToUses(&LIS->getInterval(AddReg));
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 56a38bb..b2cbdb2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR);
}
+static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ unsigned Opcode =
+ SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0));
+}
+
static bool
generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
@@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generatePipeInst(Call.get(), MIRBuilder, GR);
case SPIRV::PredicatedLoadStore:
return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::BlockingPipes:
+ return generateBlockingPipesInst(Call.get(), MIRBuilder, GR);
}
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index c259cce..492a98e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup;
def Block2DLoadStore : BuiltinGroup;
def Pipe : BuiltinGroup;
def PredicatedLoadStore : BuiltinGroup;
+def BlockingPipes : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0
defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
+//SPV_ALTERA_blocking_pipes
+defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>;
+defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>;
defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 43b2869..f681b0d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -159,7 +159,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
{"SPV_KHR_maximal_reconvergence",
SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence},
{"SPV_INTEL_kernel_attributes",
- SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}};
+ SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes},
+ {"SPV_ALTERA_blocking_pipes",
+ SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index a61351e..03bd61b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr,
"$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">;
def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops),
"OpPredicatedStoreINTEL $ptr $object $predicate">;
+
+//SPV_ALTERA_blocking_pipes
+def OpReadPipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+ "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
+def OpWritePipeBlockingALTERA :Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+ "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index e5ac76c4..af76016 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1885,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(
SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL);
break;
+ case SPIRV::OpReadPipeBlockingALTERA:
+ case SPIRV::OpWritePipeBlockingALTERA:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes);
+ Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA);
+ }
+ break;
case SPIRV::OpCooperativeMatrixGetElementCoordINTEL:
if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix))
report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the "
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 4e4e6fb..be88f33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -56,6 +56,13 @@ public:
}
};
+static cl::list<std::string> SPVAllowUnknownIntrinsics(
+ "spv-allow-unknown-intrinsics", cl::CommaSeparated,
+ cl::desc("Emit unknown intrinsics as calls to external functions. A "
+ "comma-separated input list of intrinsic prefixes must be "
+ "provided, and only intrinsics carrying a listed prefix get "
+ "emitted as described."),
+ cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional);
} // namespace
char SPIRVPrepareFunctions::ID = 0;
@@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
EraseFromParent);
Changed = true;
break;
+ default:
+ if (TM.getTargetTriple().getVendor() == Triple::AMD ||
+ any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) {
+ if (Prefix.empty())
+ return false;
+ return II->getCalledFunction()->getName().starts_with(Prefix);
+ }))
+ Changed |= lowerIntrinsicToFunction(II);
+ break;
}
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 1b4b29b..65a8885 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
-defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
+defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
@@ -611,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso
defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>;
defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>;
defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>;
+defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index af32298..fc6c290 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
// into conversion ops
setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::FP_ROUND, ISD::CONCAT_VECTORS});
+ ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND,
+ ISD::CONCAT_VECTORS});
setTargetDAGCombine(ISD::TRUNCATE);
@@ -3580,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N,
}
}
+SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems,
+ SelectionDAG &DAG) {
+ SDLoc DL(In);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = InVT.getVectorNumElements() * 2;
+ EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems);
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT));
+ if (NumElems < RequiredNumElems) {
+ return DoubleVectorWidth(Concat, RequiredNumElems, DAG);
+ }
+ return Concat;
+}
+
+SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ EVT OutElTy = OutVT.getVectorElementType();
+ if (OutElTy != MVT::i8 && OutElTy != MVT::i16)
+ return SDValue();
+
+ unsigned NumElems = OutVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ EVT FPVT = N->getOperand(0)->getValueType(0);
+ if (FPVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // First, convert to i32.
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems);
+ SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0));
+ APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ // Mask out the top MSBs.
+ SDValue Masked =
+ DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT));
+
+ if (OutVT.getSizeInBits() < 128) {
+ // Create a wide enough vector that we can use narrow.
+ EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
+ unsigned NumRequiredElems = NarrowedVT.getVectorNumElements();
+ SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG);
+ SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG);
+ return DAG.getBitcast(
+ OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits()));
+ } else {
+ return truncateVectorWithNARROW(OutVT, Masked, DL, DAG);
+ }
+ return SDValue();
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3606,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND:
case ISD::CONCAT_VECTORS:
return performVectorTruncZeroCombine(N, DCI);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performConvertFPCombine(N, DCI.DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
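
performConvertFPCombine rewrites a vector FP_TO_SINT/FP_TO_UINT from f32 lanes to i8/i16 lanes as a conversion to i32 lanes, an AND that clears the upper bits, and a narrowing step. A per-lane scalar sketch of the unsigned i8 case in plain C++ (illustrative function name, assuming in-range inputs so plain truncation and the mask-then-narrow path agree):

#include <array>
#include <cstdint>

static std::array<uint8_t, 4> fpToU8Lanes(const std::array<float, 4> &In) {
  std::array<uint8_t, 4> Out{};
  for (unsigned I = 0; I != 4; ++I) {
    uint32_t Wide = static_cast<uint32_t>(In[I]); // f32 -> i32 lane conversion
    Out[I] = static_cast<uint8_t>(Wide & 0xFFu);  // mask the high bits, then narrow
  }
  return Out;
}

On the SIMD path the i32 result comes from the saturating trunc_sat conversion, the mask from a vector AND, and the final step from truncateVectorWithNARROW; sub-128-bit outputs are first widened with poison lanes via CONCAT_VECTORS (DoubleVectorWidth) so NARROW applies, and the low subvector is then extracted.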
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d44227b3..168e041 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53442,7 +53442,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
}
SDValue NewStore =
- DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+ DAG.getStore(St->getChain(), DL, Res, NewPtr,
+ MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
Align(), St->getMemOperand()->getFlags());
// If there are other uses of StoredVal, replace with a new load of the
@@ -54639,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue NewPtr = DAG.getMemBasePlusOffset(
Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
SDValue NewLoad =
- DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+ MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
Align(), Ld->getMemOperand()->getFlags());
DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return NewLoad;