Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 18
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 45
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 265
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 83
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 504
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 508
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 14
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 10
-rw-r--r--  llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 36
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 9
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 10
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 93
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 58
-rw-r--r--  llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 2
-rw-r--r--  llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 244
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 23
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 18
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 27
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 346
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 532
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 13
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 27
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.cpp | 31
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.td | 5
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 15
-rw-r--r--  llvm/lib/Target/X86/X86InterleavedAccess.cpp | 7
49 files changed, 2239 insertions(+), 858 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 7de66cc..12fc976 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
llvm_unreachable("Unsupported ElementSize");
}
+ // Preserve undef state until DOP's reg is defined.
+ unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0;
+
//
// Create the destructive operation (if required)
//
@@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
.addReg(DstReg, RegState::Define)
.addReg(MI.getOperand(PredIdx).getReg())
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
// After the movprfx, the destructive operand is the same as Dst
DOPIdx = 0;
+ DOPRegState = 0;
// Create the additional LSL to zero the lanes when the DstReg is not
// unique. Zeros the lanes in z0 that aren't active in p0 with sequence
@@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
assert(DOPRegIsUnique && "The destructive operand should be unique");
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
.addReg(DstReg, RegState::Define)
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
DOPIdx = 0;
+ DOPRegState = 0;
}
//
@@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
//
DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+ DOPRegState = DOPRegState | RegState::Kill;
switch (DType) {
case AArch64::DestructiveUnaryPassthru:
- DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(PredIdx))
.add(MI.getOperand(SrcIdx));
break;
@@ -659,12 +665,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
- .add(MI.getOperand(SrcIdx));
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
+ .add(MI.getOperand(SrcIdx));
break;
case AArch64::DestructiveTernaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(SrcIdx))
.add(MI.getOperand(Src2Idx));
break;
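This hunk threads the destructive operand's undef flag through the movprfx expansion instead of hard-coding RegState::Kill; otherwise, expanding a pseudo whose input register is undef would emit a read of an undefined register without the undef flag. A minimal standalone sketch of the flag-threading idiom (hypothetical helper and flag bits, not the LLVM API):

```cpp
#include <cstdint>

// Hypothetical flag bits standing in for llvm::RegState.
enum : uint16_t { StateNone = 0, StateUndef = 1 << 0, StateKill = 1 << 1 };

// Capture the undef flag once, drop it as soon as a prefix instruction
// (movprfx) defines the register, and OR in Kill for the final use.
uint16_t threadDOPRegState(bool SrcIsUndef, bool PrefixedByMovPrfx) {
  uint16_t DOPRegState = SrcIsUndef ? StateUndef : StateNone;
  if (PrefixedByMovPrfx)
    DOPRegState = StateNone; // after movprfx the operand is a defined register
  return DOPRegState | StateKill; // the destructive op is its last use
}
```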
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4f13a14..d04e6c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17486,9 +17491,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleavedValues) const {
- unsigned Factor = DeinterleavedValues.size();
+ Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
@@ -17498,9 +17502,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
assert(!Mask && "Unexpected mask on a load\n");
- Value *FirstActive = *llvm::find_if(DeinterleavedValues,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ VectorType *VTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17528,6 +17530,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
+ Value *Result = nullptr;
if (NumLoads > 1) {
// Create multiple legal small ldN.
SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
@@ -17548,35 +17551,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned J = 0; J < Factor; ++J) {
- if (DeinterleavedValues[J])
- DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
- }
+
+ // Merge the values from different factors.
+ Result = PoisonValue::get(DI->getType());
+ for (unsigned J = 0; J < Factor; ++J)
+ Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
} else {
- Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned I = 0; I < Factor; I++) {
- if (DeinterleavedValues[I]) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- }
}
+
+ // Replace the output of the deinterleave2 intrinsic with the output of ldN2/ldN4
+ DI->replaceAllUsesWith(Result);
return true;
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
+ StoreInst *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!Mask && "Unexpected mask on plain store");
VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
const DataLayout &DL = SI->getModule()->getDataLayout();
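The hooks in this file migrate from LoadInst*/StoreInst* parameters to a generic Instruction* plus an explicit Mask, so masked interleaved accesses can share the same interface; AArch64 still handles only plain loads and stores and rejects anything else up front. A sketch of that guard contract, assuming only what the diff shows (the function name is hypothetical):

```cpp
#include "llvm/IR/Instructions.h"
#include <cassert>

// Targets that only support plain (unmasked) accesses bail out when handed
// anything other than a LoadInst, and expect a null mask when they do match.
bool lowerPlainInterleavedLoad(llvm::Instruction *Load, llvm::Value *Mask) {
  auto *LI = llvm::dyn_cast<llvm::LoadInst>(Load);
  if (!LI)
    return false; // masked/vp load: leave it for a lowering that supports it
  assert(!Mask && "plain loads carry no mask");
  // ... emit ldN for LI as in the diff ...
  return true;
}
```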
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6afb3c3..713793e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,19 +211,19 @@ public:
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveValues) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 996b0ed..bc57537 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+static bool getGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode, unsigned NumLanes) {
+ const MachineFunction *MF = Root.getMF();
+
+ // Early exit if optimizing for size.
+ if (MF->getFunction().hasMinSize())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
+ return false;
+
+ // Check that we have loads into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the virtual
+ // register)
+ // 2. That the addressing mode only uses a single offset register.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
+ while (!RemainingLanes.empty() && CurrInstr &&
+ CurrInstr->getOpcode() == LoadLaneOpCode &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+ CurrInstr->getNumOperands() == 4) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the subreg to reg loads an integer into the first lane.
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+ return false;
+
+ switch (NumLanes) {
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
+ return true;
+}
+
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128-bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+
+ // The pattern searches for loads into single lanes.
+ switch (Root.getOpcode()) {
+ case AArch64::LD1i32:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
+ }
+}
+
+static void
+generateGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+ unsigned Pattern, unsigned NumLanes) {
+
+ MachineFunction &MF = *Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ // Gather the initial load instructions to build the pattern
+ SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+ MachineInstr *CurrInstr = &Root;
+ for (unsigned i = 0; i < NumLanes - 1; ++i) {
+ LoadToLaneInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs,
+ [](const MachineInstr *A, const MachineInstr *B) {
+ return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+ });
+
+ MachineInstr *SubregToReg = CurrInstr;
+ LoadToLaneInstrs.push_back(
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+ auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper to create load instruction based on opcode
+ auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable(
+ "Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0); // immediate offset
+ };
+
+ // Load the remaining lanes into register 0.
+ auto LanesToLoadToReg0 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+ auto PrevReg = SubregToReg->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg0 = PrevReg;
+
+ // First load into register 1. Perform a scalar LDRSui/LDRHui/LDRBui load to
+ // zero out the upper lanes in a single instruction.
+ auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+ auto OriginalSplitLoad =
+ *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ MachineInstrBuilder MiddleIndexLoadInstr =
+ CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+ OriginalSplitLoad->getOperand(3).getReg());
+
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+ InsInstrs.push_back(MiddleIndexLoadInstr);
+ DelInstrs.push_back(OriginalSplitLoad);
+
+ // Subreg To Reg instruction for register 1.
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ unsigned SubregType;
+ switch (NumLanes) {
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable(
+ "Got invalid NumLanes for machine-combiner gather pattern");
+ }
+
+ auto SubRegToRegInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForMiddleIndex, getKillRegState(true))
+ .addImm(SubregType);
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load remaining lanes into register 1.
+ auto LanesToLoadToReg1 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+ LoadToLaneInstrsAscending.end());
+ PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
+ if (Index == NumLanes / 2 - 2) {
+ break;
+ }
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg1 = PrevReg;
+
+ // Create the final zip instruction to combine the results.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(LastLoadReg0)
+ .addReg(LastLoadReg1);
+ InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
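Taken together, the gather pattern above rewrites a serial chain of LD1 lane inserts into one 128-bit register into two independent half-register chains joined by a final ZIP1, at the cost of an extra SUBREG_TO_REG and the ZIP1 itself. A back-of-the-envelope check of the depth claim, counting each instruction as one unit and ignoring the register-copy SUBREG_TO_REG:

```cpp
#include <cassert>

// Dependence-chain length before: one scalar load plus (NumLanes - 1)
// dependent ld1 lane inserts, all through the same register.
constexpr int depthBefore(int NumLanes) { return NumLanes; }

// After: two half-chains of NumLanes/2 loads run in parallel, then one zip1.
constexpr int depthAfter(int NumLanes) { return NumLanes / 2 + 1; }

int main() {
  assert(depthBefore(4) == 4 && depthAfter(4) == 3);    // GATHER_LANE_i32
  assert(depthBefore(8) == 8 && depthAfter(8) == 5);    // GATHER_LANE_i16
  assert(depthBefore(16) == 16 && depthAfter(16) == 9); // GATHER_LANE_i8
}
```

The shorter critical path is why the pattern reports CombinerObjective::MustReduceDepth rather than a throughput objective.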
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da..02734866 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 233f42b..08f547a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -559,8 +559,7 @@ void AArch64TargetELFStreamer::finish() {
if (!Sym.isMemtag())
continue;
auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx);
- (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(),
- *Ctx.getSubtargetInfo());
+ S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b2b2b37..0e0e83b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
"Use scratch_* flat memory instructions to access scratch"
>;
+def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
+ "FlatGVSMode",
+ "true",
+ "Have GVS addressing mode with flat_* instructions"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -1112,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
+def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
+ "HasTanhInsts",
+ "true",
+ "Has v_tanh_f32/f16 instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1954,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureArchitectedSGPRs,
+ FeatureFlatGVSMode,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
FeatureAtomicDsPkAdd16Insts,
@@ -1972,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
+ FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
@@ -2381,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
+def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">,
+ AssemblerPredicate<(all_of FeatureFlatGVSMode)>;
+
def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
@@ -2693,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
+def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
+ AssemblerPredicate<(all_of FeatureTanhInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
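Each SubtargetFeature above adds a bool field to the subtarget plus an accessor that backs the Predicate string (e.g. "Subtarget->hasFlatGVSMode()"). A minimal sketch of the generated C++ shape, using a hypothetical class name in place of GCNSubtarget:

```cpp
// Hypothetical stand-in for the subtarget fields/accessors that tablegen
// generates from FeatureFlatGVSMode and FeatureTanhInsts above.
class MiniGCNSubtarget {
  bool FlatGVSMode = false;  // set by "+flat-gvs-mode" in the feature string
  bool HasTanhInsts = false; // set by "+tanh-insts" in the feature string

public:
  bool hasFlatGVSMode() const { return FlatGVSMode; }
  bool hasTanhInsts() const { return HasTanhInsts; }
};
```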
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 280f87b..3d040fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4843,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+// Detect when CMP and SELECT use the same constant and fold them to avoid
+// loading the constant twice. Specifically handles patterns like:
+// %cmp = icmp eq i32 %val, 4242
+// %sel = select i1 %cmp, i32 4242, i32 %other
+// It can be optimized to reuse %val instead of 4242 in select.
+static SDValue
+foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AMDGPUSubtarget *ST) {
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Bail out early if the constant should not be optimized.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
+
+ // Only optimize normal floating-point values (finite, non-zero, and
+ // non-subnormal as per IEEE 754); skip optimization for inlinable
+ // floating-point constants.
+ if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
+
+ // Skip optimization for inlinable integer immediates.
+ // Inlinable immediates include: -16 to 64 (inclusive).
+ if (IntVal >= -16 && IntVal <= 64)
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
+ // Try to fold CMP + SELECT patterns with shared constants (both FP and
+ // integer).
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
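At the source level, the fold corresponds to the rewrite below: when the shared constant is not an inlinable immediate (outside -16..64 for integers, or a non-inline FP literal), reusing the compared value avoids materializing the constant a second time. A worked illustration with an arbitrary non-inlinable constant:

```cpp
// Before the fold: 4242 is needed by both the compare and the select, so
// without the combine it would be materialized twice.
int selBefore(int val, int other) { return val == 4242 ? 4242 : other; }

// After the fold: when val == 4242 holds, val *is* 4242, so the select can
// return val and the constant is materialized once, for the compare only.
int selAfter(int val, int other) { return val == 4242 ? val : other; }

// The inverted form handled by the same code (setne, constant in the
// false arm): when val != 4242 is false, val equals 4242.
int selInverted(int val, int other) { return val != 4242 ? other : val; }
```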
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 44eaebf..9a90787 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -25,6 +25,7 @@ namespace {
class AMDGPUInsertDelayAlu {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
@@ -65,13 +66,16 @@ public:
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
- // Get the delay type for an instruction with the specified TSFlags.
- static DelayType getDelayType(uint64_t TSFlags) {
- if (TSFlags & SIInstrFlags::TRANS)
+ // Get the delay type for a MachineInstr.
+ DelayType getDelayType(const MachineInstr &MI) {
+ if (SIInstrInfo::isTRANS(MI))
return TRANS;
- if (TSFlags & SIInstrFlags::VALU)
+ // WMMA XDL ops are treated the same as TRANS.
+ if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI))
+ return TRANS;
+ if (SIInstrInfo::isVALU(MI))
return VALU;
- if (TSFlags & SIInstrFlags::SALU)
+ if (SIInstrInfo::isSALU(MI))
return SALU;
return OTHER;
}
@@ -368,7 +372,7 @@ public:
continue;
}
- DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ DelayType Type = getDelayType(MI);
if (instructionWaitsForSGPRWrites(MI)) {
auto It = State.find(LastSGPRFromVALU);
@@ -456,12 +460,12 @@ public:
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
<< "\n");
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasDelayAlu())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasDelayAlu())
return false;
- SII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ SII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
SchedModel = &SII->getSchedModel();
// Calculate the delay state for each basic block, iterating until we reach
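The classifier above now goes through SIInstrInfo predicates so it can consult the subtarget, letting gfx1250 bin WMMA XDL ops with TRANS. The check order matters: a TRANS instruction also carries the VALU property, which is why both the old and the new code test TRANS first. A condensed restatement of that decision order (standalone sketch, predicates reduced to bools):

```cpp
enum DelayType { VALU, TRANS, SALU, OTHER };

// TRANS (and, on gfx1250, XDL WMMA) must be tested before VALU because
// those instructions also satisfy the VALU predicate.
DelayType classifyDelay(bool IsTrans, bool IsXdlWmmaOnGfx1250, bool IsValu,
                        bool IsSalu) {
  if (IsTrans || IsXdlWmmaOnGfx1250)
    return TRANS;
  if (IsValu)
    return VALU;
  if (IsSalu)
    return SALU;
  return OTHER;
}
```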
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f4dc4a4..31a80e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2284,6 +2284,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
Base::addPostRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const {
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(SIShrinkInstructionsPass());
+ addPass(SIPostRABundlerPass());
+}
+
void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
addPass(GCNCreateVOPDPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 3c62cd1..3b2f39c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -183,6 +183,7 @@ public:
void addPreEmitPass(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
+ void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 3625db9..c8a4e22 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
+// TODO: Rename to FlatSaddrTable; it now handles both the global and the flat GVS addressing modes.
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -237,10 +238,18 @@ class FLAT_Load_Pseudo<
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Load_Pseudo_t16<string opName> {
- def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
+multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>;
let True16Predicate = UseRealTrue16Insts in
- def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+ defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
@@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
-multiclass FLAT_Store_Pseudo_t16<string opName> {
- def "" : FLAT_Store_Pseudo<opName, VGPR_32>;
- let OtherPredicates = [HasTrue16BitInsts] in
- def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
+ def "" : FLAT_Store_Pseudo<opName, regClass>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>;
+
+ defvar Name16 = opName#"_t16";
+ let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in {
+ def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+ GlobalSaddrTable<0, Name16>,
+ True16D16Table<NAME#"_D16_HI", NAME>;
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ GlobalSaddrTable<1, Name16>,
+ True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
+ }
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
@@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+ (outs),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo_RTN<
@@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+ RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+ (outs vdst_op:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op:$vdst),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName#"_rtn"> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let PseudoInstr = NAME#"_SADDR_RTN";
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo<
@@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo<
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>;
+defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
-def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
-def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">;
}
-def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
-def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
}
-defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">;
-defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">;
+defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">;
+defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
VGPR_32, i32, v2i32, VReg_64>;
@@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
(inst $saddr, $voffset, $offset, 0, $in)
>;
+class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
+ (inst $saddr, $voffset, $offset, (i32 0), $in)
+>;
+
+class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
+ (inst $saddr, $voffset, $offset, (i32 0))
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, (i32 0))
@@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
-class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, 0)
>;
-class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt> : GCNPat <
+class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
@@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
}
+multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat <inst, node, vt>;
+
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <inst, node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
- def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
-def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
foreach vt = Reg32Types.types in {
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>;
}
foreach vt = VReg_64.RegTypes in {
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>;
foreach vt = VReg_128.RegTypes in {
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
-def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
} // end foreach as
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+
let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
@@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in {
}
let OtherPredicates = [HasD16LoadStore] in {
-def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
// appropriate waits.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
VFLAT_Aliases_gfx12<name, alias>,
VFLAT_Real_gfx12<op, name>;
-multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
- string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = name> :
- VFLAT_Real_Base_gfx12<op, name, alias> {
- defm _RTN : VFLAT_Real_gfx12<op, name>;
-}
-
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
VFLAT_Real_Base_gfx12<op, name, alias> {
@@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
}
}
-multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Aliases_gfx12<name> {
let DecoderNamespace = "GFX12W64" in {
@@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
}
}
-multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
- VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
+ VFLAT_Real_AllAddr_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
}
@@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op,
}
// ENC_VFLAT.
-defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">;
-defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">;
-defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">;
-defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">;
-defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">;
-defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">;
-defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">;
-defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">;
-defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">;
-defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">;
-defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">;
-defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">;
-defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">;
-defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">;
-defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">;
-defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">;
-defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">;
-defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">;
-defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">;
-defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">;
-defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">;
-defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">;
+defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">;
+defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">;
+defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">;
+defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">;
+defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">;
+defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">;
+defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">;
+defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">;
+defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">;
+defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">;
+defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">;
+defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">;
+defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">;
+defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">;
+defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">;
+defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">;
+defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">;
+defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">;
+defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">;
+defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">;
defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">;
defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">;
defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">;
@@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
// ENC_VGLOBAL.
-defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">;
-defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">;
-defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">;
-defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">;
-defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">;
-defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">;
-defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">;
-defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">;
-defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">;
-defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">;
-defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
-defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
-defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
-defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
-defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
-defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
-defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
-defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
-defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
-defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
-defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
-defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
-defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
-defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
-defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>;
-defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>;
-
-defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
-defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
-defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
-defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
-defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
-defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
-defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
-defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
-defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
-defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
-defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
-defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
-defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
-defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
-defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
-defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
-defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
-defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
-defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
-defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
-defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
-defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
-defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
-defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
-defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
-defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
-defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
-defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>;
-defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
-defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
-defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
+defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">;
+defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">;
+defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">;
+defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">;
+defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">;
+defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">;
+defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">;
+defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">;
+defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">;
+defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">;
+defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
+defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
+defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
+defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
+defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
+defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
+defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
+defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
+defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>;
+defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>;
+
+defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
+defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
+defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
+defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
+defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
+defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
+defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
+defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
+defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
+defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
+defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
+defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
+defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
+defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
+defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
+defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
+defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
+defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
+defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
+defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
+defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
+defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
+defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
+defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
+defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
+defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>;
+defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>;
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>;
-defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>;
-defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>;
+defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>;
+defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>;
-defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>;
-defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>;
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
+defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>;
defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fce8f36..a655308 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() {
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
- RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
+ &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
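
The change above makes getRealRegPressure walk the bounds recorded for the queried region rather than the scheduler's current begin()/end(), which only coincide when RegionIdx is the region currently being scheduled. A minimal sketch of the distinction, with hypothetical types standing in for the scheduler state:

    #include <utility>
    #include <vector>

    struct Instr {};
    using Region = std::pair<const Instr *, const Instr *>; // [first, second)

    struct SchedulerState {
      std::vector<Region> Regions;     // bounds saved per region
      const Instr *CurBegin = nullptr; // region currently being scheduled
      const Instr *CurEnd = nullptr;

      // Buggy shape: 'return measure(CurBegin, CurEnd);' measures whatever
      // is currently scheduled. Fixed shape: measure the region asked for.
      int pressureAt(unsigned RegionIdx) const {
        return measure(Regions[RegionIdx].first, Regions[RegionIdx].second);
      }
      int measure(const Instr *B, const Instr *E) const {
        return static_cast<int>(E - B); // stand-in for real RP tracking
      }
    };
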
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6843052..268162b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -214,6 +214,7 @@ protected:
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
+ bool FlatGVSMode = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
@@ -233,6 +234,7 @@ protected:
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
+ bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1160,6 +1162,8 @@ public:
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ bool hasFlatGVSMode() const { return FlatGVSMode; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -1377,6 +1381,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasTanhInsts() const { return HasTanhInsts; }
+
bool hasAddPC64Inst() const { return GFX1250Insts; }
bool hasMinimum3Maximum3PKF16() const {
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9b5a463..44d9ef5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
default:
return false;
case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfe6f65..27212fda 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9308,7 +9308,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
- Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ Module *M = MF.getFunction().getParent();
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
auto *RelocSymbol = cast<GlobalVariable>(
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ce1359..2af0a57 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
llvm_unreachable("event type has no associated counter");
}
-// This objects maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
-public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
-
- bool isSmemCounter(InstCounterType T) const {
- return T == SmemAccessCounter || T == X_CNT;
- }
-
- unsigned getSgprScoresIdx(InstCounterType T) const {
- assert(isSmemCounter(T) && "Invalid SMEM counter");
- return T == X_CNT ? 1 : 0;
- }
-
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreLBs[T];
- }
-
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreUBs[T];
- }
-
- unsigned getScoreRange(InstCounterType T) const {
- return getScoreUB(T) - getScoreLB(T);
- }
-
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS)
- return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
- }
-
- bool merge(const WaitcntBrackets &Other);
-
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- const MachineOperand &Op) const;
-
- bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-
- void determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const;
- void determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- determineWait(T, {RegNo, RegNo + 1}, Wait);
- }
-
- void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
-
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
- }
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
- return HasPending;
- }
-
- bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
- // Return true if more than one bit is set in Events.
- return Events & (Events - 1);
- }
-
- bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
- }
-
- void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
- }
-
- bool hasPendingGDS() const {
- return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
- }
-
- unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
- }
-
- void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
-
- // Return true if there might be pending writes to the vgpr-interval by VMEM
- // instructions with types different from V.
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- if (VgprVmemTypes[RegNo] & ~(1 << V))
- return true;
- }
- return false;
- }
-
- void clearVgprVmemTypes(RegInterval Interval) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- VgprVmemTypes[RegNo] = 0;
- }
- }
-
- void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
- }
-
- ArrayRef<const MachineInstr *> getLDSDMAStores() const {
- return LDSDMAStores;
- }
-
- bool hasPointSampleAccel(const MachineInstr &MI) const;
- bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
- RegInterval Interval) const;
-
- void print(raw_ostream &) const;
- void dump() const { print(dbgs()); }
-
-private:
- struct MergeInfo {
- unsigned OldLB;
- unsigned OtherLB;
- unsigned MyShift;
- unsigned OtherShift;
- };
- static bool mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore);
-
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreUBs[T] = Val;
-
- if (T != EXP_CNT)
- return;
-
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
- }
-
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- setScoreByInterval({GprNo, GprNo + 1}, T, Val);
- }
-
- void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
- unsigned Score);
-
- void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
-
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
- // Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
- // Remember the last GDS operation.
- unsigned LastGDS = 0;
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
- // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
- // X_CNT score.
- unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
- // Store representative LDS DMA operations. The only useful info here is
- // alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
-};
+class WaitcntBrackets;
// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
@@ -640,8 +407,13 @@ public:
};
class SIInsertWaitcnts {
+public:
+ const GCNSubtarget *ST;
+ InstCounterType SmemAccessCounter;
+ InstCounterType MaxCounter;
+ const unsigned *WaitEventMaskForInst;
+
private:
- const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -657,8 +429,6 @@ private:
bool Dirty = true;
};
- InstCounterType SmemAccessCounter;
-
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -675,7 +445,7 @@ private:
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -686,6 +456,30 @@ public:
(void)ForceVMCounter;
}
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
bool isPreheaderToFlush(MachineBasicBlock &MBB,
const WaitcntBrackets &ScoreBrackets);
@@ -791,6 +585,211 @@ public:
WaitcntBrackets &ScoreBrackets);
};
+// This object maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt, in order to know if there are multiple types of events within
+// the brackets. When multiple types of events happen within the brackets,
+// the wait count may be decremented out of order, so we need to put in an
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+ WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
+
+ bool isSmemCounter(InstCounterType T) const {
+ return T == Context->SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ assert(isSmemCounter(T) && "Invalid SMEM counter");
+ return T == X_CNT ? 1 : 0;
+ }
+
+ unsigned getScoreLB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreLBs[T];
+ }
+
+ unsigned getScoreUB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreUBs[T];
+ }
+
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
+ if (GprNo < NUM_ALL_VGPRS)
+ return VgprScores[T][GprNo];
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+ }
+
+ bool merge(const WaitcntBrackets &Other);
+
+ RegInterval getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ const MachineOperand &Op) const;
+
+ bool counterOutOfOrder(InstCounterType T) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+
+ void determineWait(InstCounterType T, RegInterval Interval,
+ AMDGPU::Waitcnt &Wait) const;
+ void determineWait(InstCounterType T, int RegNo,
+ AMDGPU::Waitcnt &Wait) const {
+ determineWait(T, {RegNo, RegNo + 1}, Wait);
+ }
+
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
+ }
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
+
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = hasPendingEvent(T);
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ }
+
+ bool hasPendingGDS() const {
+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+ }
+
+ unsigned getPendingGDSWait() const {
+ return std::min(getScoreUB(DS_CNT) - LastGDS,
+ Context->getWaitCountMax(DS_CNT) - 1);
+ }
+
+ void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
+ // Return true if there might be pending writes to the vgpr-interval by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ if (VgprVmemTypes[RegNo] & ~(1 << V))
+ return true;
+ }
+ return false;
+ }
+
+ void clearVgprVmemTypes(RegInterval Interval) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[RegNo] = 0;
+ }
+ }
+
+ void setStateOnFunctionEntryOrReturn() {
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
+ PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
+ }
+
+ bool hasPointSampleAccel(const MachineInstr &MI) const;
+ bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+ RegInterval Interval) const;
+
+ void print(raw_ostream &) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ struct MergeInfo {
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
+ };
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
+
+ void setScoreLB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreUBs[T] = Val;
+
+ if (T != EXP_CNT)
+ return;
+
+ if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+ setScoreByInterval({GprNo, GprNo + 1}, T, Val);
+ }
+
+ void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
+ unsigned Score);
+
+ void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ const MachineOperand &Op, InstCounterType CntTy,
+ unsigned Val);
+
+ const SIInsertWaitcnts *Context;
+
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
+ // Remember the last flat memory operation.
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
+ // Remember the last GDS operation.
+ unsigned LastGDS = 0;
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+ // Wait cnt scores for every sgpr; only the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only) and X_CNT (gfx1250) counters are
+ // relevant. Row 0 holds the score for either DS_CNT or KM_CNT and row 1
+ // holds the X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
+};
+
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
static char ID;
@@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
+ MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
assert(isUInt<8>(RegIdx));
@@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
- if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+ if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
return false;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+ InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
void WaitcntBrackets::print(raw_ostream &OS) const {
+ const GCNSubtarget *ST = Context->ST;
+
OS << '\n';
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
@@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// s_waitcnt instruction.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !ST->hasFlatLgkmVMemCountInOrder()) {
+ !Context->ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
@@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
unsigned NeededWait =
- std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~WaitEventMaskForInst[T];
+ PendingEvents &= ~Context->WaitEventMaskForInst[T];
}
}
@@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
@@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
VgprUB = std::max(VgprUB, Other.VgprUB);
SgprUB = std::max(SgprUB, Other.SgprUB);
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
+ const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+ WaitEventMaskForInst = WCG->getWaitEventMask();
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- HardwareLimits Limits = {};
if (ST->hasExtendedWaitCounts()) {
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
@@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets) {
- Brackets = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ Brackets = std::make_unique<WaitcntBrackets>(this);
} else {
// Reinitialize in-place. N.B. do not do this by assigning from a
// temporary because the WaitcntBrackets class is large and it could
// cause this function to use an unreasonable amount of stack space.
Brackets->~WaitcntBrackets();
- new (Brackets.get()) WaitcntBrackets(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ new (Brackets.get()) WaitcntBrackets(this);
}
}
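
The "Reinitialize in-place" branch above relies on an explicit destructor call followed by placement new, precisely so that no temporary WaitcntBrackets is ever built on the stack. A minimal sketch of that idiom, assuming only a large type:

    #include <memory>
    #include <new>

    struct Big {
      char State[1 << 16];
      explicit Big(int Seed) { State[0] = static_cast<char>(Seed); }
    };

    void reinitInPlace(std::unique_ptr<Big> &P, int Seed) {
      if (!P) {
        P = std::make_unique<Big>(Seed);
        return;
      }
      // 'P = Big(Seed);'-style assignment would first build a temporary Big
      // on the stack, which is what the N.B. comment warns against.
      P->~Big();                // end the old object's lifetime
      new (P.get()) Big(Seed);  // construct a new one in the same storage
    }
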
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a1e14d9..9da8a1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6460,7 +6460,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldSAddrIdx < 0)
return false;
- assert(isSegmentSpecificFLAT(Inst));
+ assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
if (NewOpc < 0)
@@ -6484,7 +6484,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldVAddrIdx >= 0) {
MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
- if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
+ if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
!VAddrDef->getOperand(1).isImm() ||
VAddrDef->getOperand(1).getImm() != 0)
return false;
@@ -6537,7 +6537,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
- if (!isSegmentSpecificFLAT(MI))
+ if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
return;
// Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
@@ -10466,10 +10466,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
return TargetInstrInfo::isGlobalMemoryObject(MI);
}
+bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
+ if (!isWMMA(MI) && !isSWMMAC(MI))
+ return false;
+
+ if (AMDGPU::isGFX1250(ST))
+ return AMDGPU::getWMMAIsXDL(MI.getOpcode());
+
+ return true;
+}
+
bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
- if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
+ if (AMDGPU::isGFX12Plus(ST))
+ return isDOT(MI) || isXDLWMMA(MI);
+
+ if (!isMAI(MI) || isDGEMM(Opcode) ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
return false;
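
The moveFlatAddrToVGPR change above loosens the VADDR check from the specific V_MOV_B32_e32 opcode to any move-immediate of zero, so pseudos such as AV_MOV_B32_IMM_PSEUDO also qualify. A reduced sketch of the predicate, with a hypothetical helper name not present in the patch:

    #include "llvm/CodeGen/MachineInstr.h"

    // Hypothetical helper: true if Def materializes the immediate 0,
    // regardless of which move opcode (real or pseudo) is used.
    static bool definesZeroImm(const llvm::MachineInstr *Def) {
      return Def && Def->isMoveImmediate() && Def->getOperand(1).isImm() &&
             Def->getOperand(1).getImm() == 0;
    }
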
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a380199..3a48e65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -867,6 +867,8 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
+ bool isXDLWMMA(const MachineInstr &MI) const;
+
bool isXDL(const MachineInstr &MI) const;
static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); }
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b0d6fd9..5097ac03 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = MI.getIterator();
++MBBI;
- const SITargetLowering *TLI =
- static_cast<const SITargetLowering *>(STM->getTargetLowering());
+ const SITargetLowering *TLI = STM->getTargetLowering();
for ( ; MBBI != E; ++MBBI) {
MachineInstr &MINext = *MBBI;
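
The dropped static_cast works because the subtarget's getTargetLowering() already hands back the derived SITargetLowering type, so C++ covariant return types make the cast redundant. A standalone sketch of the pattern under that assumption, with stand-in names:

    struct TLBase { virtual ~TLBase() = default; };
    struct SITL final : TLBase {};

    struct SubtargetBase {
      virtual ~SubtargetBase() = default;
      virtual const TLBase *getTargetLowering() const = 0;
    };

    struct GCNLike final : SubtargetBase {
      // Covariant return type: callers holding a GCNLike get SITL* directly,
      // so no static_cast is needed at the use site.
      const SITL *getTargetLowering() const override { return &TLI; }
      SITL TLI;
    };
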
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9df2bde..7725881 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
#define GET_MAIInstInfoTable_IMPL
+#define GET_WMMAInstInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info && Info->is_gfx940_xdl;
}
+bool getWMMAIsXDL(unsigned Opc) {
+ const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc);
+ return Info ? Info->is_wmma_xdl : false;
+}
+
uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) {
switch (EncodingVal) {
case MFMAScaleFormats::FP6_E2M3:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6708e0a..c9d2c28 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -119,6 +119,11 @@ struct True16D16Info {
unsigned LoOp;
};
+struct WMMAInstInfo {
+ uint16_t Opcode;
+ bool is_wmma_xdl;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -129,6 +134,7 @@ struct True16D16Info {
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#define GET_True16D16Table_DECL
+#define GET_WMMAInstInfoTable_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc);
LLVM_READONLY
bool getMAIIsGFX940XDL(unsigned Opc);
+LLVM_READONLY
+bool getWMMAIsXDL(unsigned Opc);
+
// Get an equivalent BitOp3 for a binary logical \p Opc.
// \returns BitOp3 modifier for the logical operation or zero.
// Used in VOPD3 conversion.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index e2f3710..8c35fea 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in
+defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -531,6 +534,11 @@ let SubtargetPredicate = HasBF16TransInsts in {
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1133,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
+defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
@@ -1141,6 +1150,11 @@ defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
+defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
+defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
+defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
+defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 65d1c4e..fd3b052 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
auto T = const_cast<Type*>(CP->getType());
auto C = const_cast<Constant*>(CP->getConstVal());
- auto M = const_cast<Module*>(DAG.getMachineFunction().
- getFunction().getParent());
+ auto M = DAG.getMachineFunction().getFunction().getParent();
auto GV = new GlobalVariable(
*M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
@@ -21585,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21593,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef5..9159f3d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -681,7 +681,7 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index ce43645..f0e2e78 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Info.RootFlattenedArrayType, Info.RootPointerOperand,
{ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
+ // If the pointer operand is a global variable and all indices are 0,
+ // IRBuilder::CreateGEP will return the global variable instead of creating
+ // a GEP instruction or GEP ConstantExpr. In this case we have to create and
+ // insert our own GEP instruction.
+ if (!isa<GEPOperator>(NewGEP))
+ NewGEP = GetElementPtrInst::Create(
+ Info.RootFlattenedArrayType, Info.RootPointerOperand,
+ {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
+ Builder.GetInsertPoint());
+
// Replace the current GEP with the new GEP. Store GEPInfo into the map
// for later use in case this GEP was not the end of the chain
GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});
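
The guard added above matters because IRBuilder constant-folds: with a global base and all-zero constant indices, CreateGEP can return the global itself, and the cast<GEPOperator> on the next line would then assert. A hedged sketch of the fallback, mirroring the same API calls the patch uses:

    #include "llvm/IR/GEPNoWrapFlags.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Operator.h"
    using namespace llvm;

    // Build a two-index GEP, falling back to an explicit instruction when
    // the builder folds the expression away (e.g. all-zero indices on a
    // global variable).
    static Value *createGEPOrInst(IRBuilder<> &B, Type *ArrTy, Value *Base,
                                  ArrayRef<Value *> Idx, GEPNoWrapFlags NW) {
      Value *G = B.CreateGEP(ArrTy, Base, Idx, "flat", NW);
      if (!isa<GEPOperator>(G)) // got the base back, not a GEP
        G = GetElementPtrInst::Create(ArrTy, Base, Idx, NW, "flat",
                                      B.GetInsertPoint());
      return G;
    }
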
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c9ff713..c73648f 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
static void
-legalizeLoadStoreOnArrayAllocas(Instruction &I,
+legalizeScalarLoadStoreOnArrays(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I,
} else
return;
- assert(LoadStoreTy->isSingleValueType() &&
- "Expected load/store type to be a single-valued type");
+ // If the load/store is not of a single-value type (i.e., scalar or vector)
+ // then we do not modify it. It shouldn't be a vector either because the
+ // dxil-data-scalarization pass is expected to run before this, but it's not
+ // incorrect to apply this transformation to vector loads/stores.
+ if (!LoadStoreTy->isSingleValueType())
+ return;
- auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp);
- if (!AllocaPtrOp)
+ Type *ArrayTy;
+ if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
+ ArrayTy = GlobalVarPtrOp->getValueType();
+ else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
+ ArrayTy = AllocaPtrOp->getAllocatedType();
+ else
return;
- Type *Ty = AllocaPtrOp->getAllocatedType();
- if (!isa<ArrayType>(Ty))
+ if (!isa<ArrayType>(ArrayTy))
return;
- assert(!isa<ArrayType>(Ty->getArrayElementType()) &&
- "Expected allocated type of AllocaInst to be a flat ArrayType");
- IRBuilder<> Builder(&I);
- Value *Zero = Builder.getInt32(0);
- Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "",
- GEPNoWrapFlags::all());
+ assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+ "Expected array element type to be the same as to the scalar load or "
+ "store type");
+
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+ Value *GEP = GetElementPtrInst::Create(
+ ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
I.setOperand(PtrOpIndex, GEP);
}
@@ -651,7 +659,7 @@ private:
// downcastI64toI32InsertExtractElements needs to handle.
LegalizationPipeline[Stage2].push_back(
downcastI64toI32InsertExtractElements);
- LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas);
+ LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
}
};
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 53943de3..e285e04 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
R = N;
break;
}
+ case ISD::AssertSext: {
+ EVT T = cast<VTSDNode>(N.getOperand(1))->getVT();
+ if (T.getSizeInBits() == 32)
+ R = N.getOperand(0);
+ else
+ return false;
+ break;
+ }
+
default:
return false;
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index d5a5f17..36c3011 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file describes the baisc single-precision floating-point instructions.
+// This file describes the basic single-precision floating-point instructions.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index c47987f..2378664 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
EVT VecTy = Op->getOperand(0)->getValueType(0);
SDValue Idx = Op->getOperand(1);
- EVT EltTy = VecTy.getVectorElementType();
unsigned NumElts = VecTy.getVectorNumElements();
- if (isa<ConstantSDNode>(Idx) &&
- (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 ||
- EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2))
+ if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
return Op;
return SDValue();
@@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB,
Register ScratchReg1 = XSrc;
if (Idx >= HalfSize) {
ScratchReg1 = MRI.createVirtualRegister(RC);
- BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1)
+ BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1)
.addReg(XSrc)
- .addReg(XSrc)
- .addImm(1);
+ .addImm(14);
}
Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC);
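
The switch from XVPERMI_Q to XVPERMI_D with immediate 14 is easiest to see numerically: assuming xvpermi.d selects destination double-word i from source element (imm >> 2*i) & 3, imm 14 = 0b1110 yields {2, 3, 0, 0}, i.e. the source's high 128 bits land in the low half, which is all the following subregister extract needs. A small self-check under that assumption:

    #include <array>
    #include <cassert>

    int main() {
      std::array<int, 4> Src = {0, 1, 2, 3}; // double-word lanes of source
      std::array<int, 4> Dst{};
      const unsigned Imm = 14; // 0b00001110
      for (int I = 0; I < 4; ++I)
        Dst[I] = Src[(Imm >> (2 * I)) & 3];
      assert((Dst == std::array<int, 4>{2, 3, 0, 0})); // high half moved low
      return 0;
    }
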
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 95e9fd4..a0107e4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> {
(!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>;
}
+multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 4))),
+ !add(imm2, 4)),
+ (XVEXTRINS_W $xd, $xj, Imm)>;
+ }
+ }
+}
+
+multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 2))),
+ !add(imm2, 2)),
+ (XVEXTRINS_D $xd, $xj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLASX] in {
// XVADD_{B/H/W/D}
@@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">;
defm : PatCCXrXrF<SETO, "XVFCMP_COR">;
defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">;
+// Insert two elements extracted from one vector into another. (The two
+// elements must occupy the same positions within the low and high 128-bit
+// halves of the source and destination vectors.)
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D}
+// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{W/D}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v32i8:$xd,
+ (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))),
+ !add(imm2, 16)),
+ (XVEXTRINS_B $xd, $xj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v16i16:$xd,
+ (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))),
+ !add(imm2, 8)),
+ (XVEXTRINS_H $xd, $xj, Imm)>;
+ }
+}
+
+defm : PairInsertExtractPatV8<v8i32, GRLenVT>;
+defm : PairInsertExtractPatV8<v8f32, f32>;
+defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
+defm : PairInsertExtractPatV4<v4f64, f64>;
+
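
The immediate built by !or(!shl(imm2, 4), imm1) in the patterns above is the
[xv]extrins selector: destination element index in the high nibble, source
element index in the low nibble (the same encoding is used by the LSX
VEXTRINS patterns later in this diff). A small illustrative C++ sketch of the
encoding:

#include <cassert>
#include <cstdint>

// Build the [XV]EXTRINS immediate: destination index in bits [7:4], source
// index in bits [3:0], mirroring !or(!shl(imm2, 4), imm1).
static uint8_t extrinsSelector(unsigned DstIdx, unsigned SrcIdx) {
  assert(DstIdx < 16 && SrcIdx < 16);
  return static_cast<uint8_t>((DstIdx << 4) | SrcIdx);
}

// e.g. extrinsSelector(2, 1) == 0x21 copies source element 1 into
// destination element 2; for LASX the copy happens at the same positions in
// both 128-bit halves, matching the restriction described above.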
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
(PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
@@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+ (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+ (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+ (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
@@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in {
def : RegRegStPat<store, XVSTX, LASX256, vt>;
}
+// Bitcast float/double element extracted from vector to integer.
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))),
+ (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>;
+def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))),
+ (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>;
+
// Vector extraction with constant index.
+foreach imm = 16...31 in {
+ defvar Imm = !and(imm, 15);
+ def : Pat<(i64 (vector_extract v32i8:$xj, imm)),
+ (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128),
+ Imm)>;
+}
+foreach imm = 8...15 in {
+ defvar Imm = !and(imm, 7);
+ def : Pat<(i64 (vector_extract v16i16:$xj, imm)),
+ (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128),
+ Imm)>;
+}
def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)),
(VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>;
def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d780..962e7c2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst,
(Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>;
}
+multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_W $vd, $vj, Imm)>;
+ }
+ }
+}
+
+multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_D $vd, $vj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLSX] in {
// VADD_{B/H/W/D}
@@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">;
defm : PatCCVrVrF<SETO, "VFCMP_COR">;
defm : PatCCVrVrF<SETUO, "VFCMP_CUN">;
+// Insert an element extracted from one vector into another.
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v16i8:$vd,
+ (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2),
+ (VEXTRINS_B $vd, $vj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v8i16:$vd,
+ (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2),
+ (VEXTRINS_H $vd, $vj, Imm)>;
+ }
+}
+
+defm : InsertExtractPatV4<v4i32, GRLenVT>;
+defm : InsertExtractPatV4<v4f32, f32>;
+defm : InsertExtractPatV2<v2i64, GRLenVT>;
+defm : InsertExtractPatV2<v2f64, f64>;
+
// VINSGR2VR_{B/H/W/D}
def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm),
(VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>;
@@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
(VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
(VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm),
+ (VINSGR2VR_W $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm),
+ (VINSGR2VR_D $vd, $rj, uimm1:$imm)>;
def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
(VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
@@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
def : RegRegStPat<store, VSTX, LSX128, vt>;
}
+// Bitcast float/double element extracted from vector to integer.
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))),
+ (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>;
+def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))),
+ (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>;
+
// Vector extraction with constant index.
def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)),
(VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>;
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 01e4d17..259b71b 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
TOut.getStreamer().emitRelocDirective(
*TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
- RelocJalrExpr, IDLoc, *STI);
+ RelocJalrExpr);
TOut.getStreamer().emitLabel(TmpLabel);
}
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index c18ba44..ca03310 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -166,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI,
OutStreamer.emitRelocDirective(
*OffsetExpr,
Subtarget.inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
- CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo());
+ CaleeExpr);
OutStreamer.emitLabel(OffsetLabel);
return;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d017c65..7aa06f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1048,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
MVT::v32i32, MVT::v64i32, MVT::v128i32},
Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom);
+ // Enable custom lowering for the following:
+ // * MVT::i128 - clusterlaunchcontrol
+ // * MVT::i32 - prmt
+ // * MVT::Other - internal.addrspace.wrap
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
+ Custom);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
+ SelectionDAG &DAG,
+ unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
+ {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
+}
+
+static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
+ SelectionDAG &DAG,
+ unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
}
- return DAG.getNode(
- NVPTXISD::PRMT, DL, MVT::v4i8,
- {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ return getPRMT(L, R, SelectionValue, DL, DAG);
};
auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
- return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
+ return DAG.getBitcast(VT, PRMT3210);
}
// Get value or the Nth operand as an APInt(32). Undef values treated as 0.
@@ -2176,11 +2189,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
DAG.getZExtOrTrunc(Index, DL, MVT::i32),
DAG.getConstant(0x7770, DL, MVT::i32));
- SDValue PRMT = DAG.getNode(
- NVPTXISD::PRMT, DL, MVT::i32,
- {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32),
- Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
- return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
+ SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
+ DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
+ SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
+ Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
+ Ext->setFlags(Flags);
+ return Ext;
}
// Constant index will be matched by tablegen.
@@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
}
SDLoc DL(Op);
- return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
- DAG.getConstant(Selector, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
+ SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
+ DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
+ return DAG.getBitcast(Op.getValueType(), PRMT);
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
@@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op,
{TryCancelResponse0, TryCancelResponse1});
}
+static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
+ const unsigned Mode = [&]() {
+ switch (Op->getConstantOperandVal(0)) {
+ case Intrinsic::nvvm_prmt:
+ return NVPTX::PTXPrmtMode::NONE;
+ case Intrinsic::nvvm_prmt_b4e:
+ return NVPTX::PTXPrmtMode::B4E;
+ case Intrinsic::nvvm_prmt_ecl:
+ return NVPTX::PTXPrmtMode::ECL;
+ case Intrinsic::nvvm_prmt_ecr:
+ return NVPTX::PTXPrmtMode::ECR;
+ case Intrinsic::nvvm_prmt_f4e:
+ return NVPTX::PTXPrmtMode::F4E;
+ case Intrinsic::nvvm_prmt_rc16:
+ return NVPTX::PTXPrmtMode::RC16;
+ case Intrinsic::nvvm_prmt_rc8:
+ return NVPTX::PTXPrmtMode::RC8;
+ default:
+ llvm_unreachable("unsupported/unhandled intrinsic");
+ }
+ }();
+ SDLoc DL(Op);
+ SDValue A = Op->getOperand(1);
+ SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
+ : DAG.getConstant(0, DL, MVT::i32);
+ SDValue Selector = (Op->op_end() - 1)->get();
+ return getPRMT(A, B, Selector, DL, DAG, Mode);
+}
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
switch (Op->getConstantOperandVal(0)) {
default:
return Op;
+ case Intrinsic::nvvm_prmt:
+ case Intrinsic::nvvm_prmt_b4e:
+ case Intrinsic::nvvm_prmt_ecl:
+ case Intrinsic::nvvm_prmt_ecr:
+ case Intrinsic::nvvm_prmt_f4e:
+ case Intrinsic::nvvm_prmt_rc16:
+ case Intrinsic::nvvm_prmt_rc8:
+ return lowerPrmtIntrinsic(Op, DAG);
case Intrinsic::nvvm_internal_addrspace_wrap:
return Op.getOperand(1);
case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
@@ -5775,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto &DAG = DCI.DAG;
- auto PRMT = DAG.getNode(
- NVPTXISD::PRMT, DL, MVT::v4i8,
- {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
- return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
+ auto PRMT =
+ getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
+ (Op1Bytes << 8) | Op0Bytes, DL, DAG);
+ return DAG.getBitcast(VT, PRMT);
}
static SDValue combineADDRSPACECAST(SDNode *N,
@@ -5797,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N,
return SDValue();
}
+// Given a constant selector value and a prmt mode, return the selector value
+// normalized to the generic prmt mode. See the PTX ISA documentation for more
+// details:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
+static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
+ if (Mode == NVPTX::PTXPrmtMode::NONE)
+ return Selector;
+
+ const unsigned V = Selector.trunc(2).getZExtValue();
+
+ const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
+ unsigned S3) {
+ return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
+ };
+
+ switch (Mode) {
+ case NVPTX::PTXPrmtMode::F4E:
+ return GetSelector(V, V + 1, V + 2, V + 3);
+ case NVPTX::PTXPrmtMode::B4E:
+ return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
+ case NVPTX::PTXPrmtMode::RC8:
+ return GetSelector(V, V, V, V);
+ case NVPTX::PTXPrmtMode::ECL:
+ return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
+ case NVPTX::PTXPrmtMode::ECR:
+ return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
+ case NVPTX::PTXPrmtMode::RC16: {
+ unsigned V1 = (V & 1) << 1;
+ return GetSelector(V1, V1 + 1, V1, V1 + 1);
+ }
+ default:
+ llvm_unreachable("Invalid PRMT mode");
+ }
+}
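
A worked example of the normalization (plain integers instead of APInt;
helper names are illustrative): only the low two bits of the selector, V, are
consulted, and each mode expands V into a full four-nibble generic selector.

#include <cstdint>

// Standalone mirrors of two getPRMTSelector rows.
static uint32_t prmtSelectorF4E(uint32_t V) { // forward 4-byte extract
  return V | ((V + 1) << 4) | ((V + 2) << 8) | ((V + 3) << 12);
}
static uint32_t prmtSelectorRC8(uint32_t V) { // replicate one byte
  return V | (V << 4) | (V << 8) | (V << 12);
}

// prmtSelectorF4E(1) == 0x4321: bytes 1, 2, 3, 4 of {b, a}, in order.
// prmtSelectorRC8(2) == 0x2222: byte 2 broadcast into all four result bytes.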
+
+static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
+ // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ APInt BitField = B.concat(A);
+ APInt SelectorVal = getPRMTSelector(Selector, Mode);
+ APInt Result(32, 0);
+ for (unsigned I : llvm::seq(4U)) {
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+ APInt Byte = BitField.extractBits(8, Idx * 8);
+ if (Sign)
+ Byte = Byte.ashr(8);
+ Result.insertBits(Byte, I * 8);
+ }
+ return Result;
+}
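
As a concrete check of the constant fold (a plain-integer mirror of the
generic-mode loop above, illustrative only): with a = 0x03020100 and
b = 0x07060504 the 64-bit bitfield holds bytes 0..7 in order.

#include <cstdint>

// Generic-mode PRMT on plain integers: each selector nibble picks one of the
// eight bytes of {b, a}; if the nibble's top bit is set, the selected byte's
// sign bit is replicated across the result byte instead.
static uint32_t prmtGeneric(uint32_t A, uint32_t B, uint16_t Sel) {
  uint64_t Bytes = (static_cast<uint64_t>(B) << 32) | A;
  uint32_t Res = 0;
  for (int I = 0; I < 4; ++I) {
    unsigned Nib = (Sel >> (4 * I)) & 0xF;
    uint8_t Byte = (Bytes >> ((Nib & 7) * 8)) & 0xFF;
    if (Nib & 8)
      Byte = (Byte & 0x80) ? 0xFF : 0x00; // replicate the sign bit
    Res |= static_cast<uint32_t>(Byte) << (8 * I);
  }
  return Res;
}

// prmtGeneric(0x03020100, 0x07060504, 0x5140) == 0x05010400.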
+
+static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ // Constant fold PRMT
+ if (isa<ConstantSDNode>(N->getOperand(0)) &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2)))
+ return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
+ N->getConstantOperandAPInt(1),
+ N->getConstantOperandAPInt(2),
+ N->getConstantOperandVal(3)),
+ SDLoc(N), N->getValueType(0));
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
switch (N->getOpcode()) {
- default: break;
- case ISD::ADD:
- return PerformADDCombine(N, DCI, OptLevel);
- case ISD::FADD:
- return PerformFADDCombine(N, DCI, OptLevel);
- case ISD::MUL:
- return PerformMULCombine(N, DCI, OptLevel);
- case ISD::SHL:
- return PerformSHLCombine(N, DCI, OptLevel);
- case ISD::AND:
- return PerformANDCombine(N, DCI);
- case ISD::UREM:
- case ISD::SREM:
- return PerformREMCombine(N, DCI, OptLevel);
- case ISD::SETCC:
- return PerformSETCCCombine(N, DCI, STI.getSmVersion());
- case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadV2:
- case NVPTXISD::LoadV4:
- return combineUnpackingMovIntoLoad(N, DCI);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
- case ISD::STORE:
- case NVPTXISD::StoreV2:
- case NVPTXISD::StoreV4:
- return PerformStoreCombine(N, DCI);
- case ISD::EXTRACT_VECTOR_ELT:
- return PerformEXTRACTCombine(N, DCI);
- case ISD::VSELECT:
- return PerformVSELECTCombine(N, DCI);
- case ISD::BUILD_VECTOR:
- return PerformBUILD_VECTORCombine(N, DCI);
- case ISD::ADDRSPACECAST:
- return combineADDRSPACECAST(N, DCI);
+ default:
+ break;
+ case ISD::ADD:
+ return PerformADDCombine(N, DCI, OptLevel);
+ case ISD::ADDRSPACECAST:
+ return combineADDRSPACECAST(N, DCI);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return PerformBUILD_VECTORCombine(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformEXTRACTCombine(N, DCI);
+ case ISD::FADD:
+ return PerformFADDCombine(N, DCI, OptLevel);
+ case ISD::LOAD:
+ case NVPTXISD::LoadParamV2:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4:
+ return combineUnpackingMovIntoLoad(N, DCI);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case NVPTXISD::PRMT:
+ return combinePRMT(N, DCI, OptLevel);
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI, STI.getSmVersion());
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::SREM:
+ case ISD::UREM:
+ return PerformREMCombine(N, DCI, OptLevel);
+ case NVPTXISD::StoreParam:
+ case NVPTXISD::StoreParamV2:
+ case NVPTXISD::StoreParamV4:
+ return PerformStoreParamCombine(N, DCI);
+ case ISD::STORE:
+ case NVPTXISD::StoreV2:
+ case NVPTXISD::StoreV4:
+ return PerformStoreCombine(N, DCI);
+ case ISD::VSELECT:
+ return PerformVSELECTCombine(N, DCI);
}
return SDValue();
}
@@ -6387,7 +6511,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
unsigned Mode = Op.getConstantOperandVal(3);
- if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector)
+ if (!Selector)
return;
KnownBits AKnown = DAG.computeKnownBits(A, Depth);
@@ -6396,7 +6520,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
// {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
KnownBits BitField = BKnown.concat(AKnown);
- APInt SelectorVal = Selector->getAPIntValue();
+ APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
APInt Sel = SelectorVal.extractBits(4, I * 4);
unsigned Idx = Sel.getLoBits(3).getZExtValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4eef6c9..a5bb83d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1453,18 +1453,33 @@ let hasSideEffects = false in {
(ins PrmtMode:$mode),
"prmt.b32$mode",
[(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32rir
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins B32:$a, i32imm:$b, B32:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
def PRMT_B32rii
: BasicFlagsNVPTXInst<(outs B32:$d),
(ins B32:$a, i32imm:$b, Hexu32imm:$c),
(ins PrmtMode:$mode),
"prmt.b32$mode",
[(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>;
- def PRMT_B32rir
+ def PRMT_B32irr
: BasicFlagsNVPTXInst<(outs B32:$d),
- (ins B32:$a, i32imm:$b, B32:$c),
- (ins PrmtMode:$mode),
+ (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>;
+ def PRMT_B32iri
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32iir
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode),
"prmt.b32$mode",
- [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
+ [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index bad4c3c..70150bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1047,24 +1047,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
// MISC
//
-class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
- : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c),
- (PRMT_B32rrr $a, $b, $c, prmt_mode)>;
-
-class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
- : Pat<(prmt_intrinsic i32:$a, i32:$c),
- (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>;
-
-def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>;
-def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>;
-def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>;
-
-def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>;
-def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>;
-def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>;
-def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>;
-
-
def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32",
[(int_nvvm_nanosleep imm:$i)]>,
Requires<[hasPTX<63>, hasSM<70>]>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 0f948b2..cfec46d2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3058,17 +3058,28 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
};
if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) {
+ // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2))
if (LHS.getOpcode() == ISD::ADD &&
- SelectShl(LHS.getOperand(0), Index, Scale) &&
!isa<ConstantSDNode>(LHS.getOperand(1)) &&
isInt<12>(C1->getSExtValue())) {
- // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2))
- SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
- SDLoc(Addr), VT);
- Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
- LHS.getOperand(1), C1Val),
- 0);
- return true;
+ if (SelectShl(LHS.getOperand(1), Index, Scale)) {
+ SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
+ SDLoc(Addr), VT);
+ Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
+ LHS.getOperand(0), C1Val),
+ 0);
+ return true;
+ }
+
+ // Add is commutative so we need to check both operands.
+ if (SelectShl(LHS.getOperand(0), Index, Scale)) {
+ SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
+ SDLoc(Addr), VT);
+ Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
+ LHS.getOperand(1), C1Val),
+ 0);
+ return true;
+ }
}
// Don't match add with constants.
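
One plausible source shape that produces the (add (add (shl A C2) B) C1)
address above (hypothetical illustration, not taken from a test in this
patch):

#include <cstdint>

// &Base[I + 4] == base + (I << 2) + 16 on RV64, i.e.
// (add (add (shl I, 2), base), 16). The rewrite folds base+16 into a single
// ADDI so that (shl I, 2) stays available for a scaled register-register
// addressing form (e.g. sh2add with Zba).
int32_t loadAt(const int32_t *Base, uint64_t I) {
  return Base[I + 4];
}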
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3af729a..e0a8c07 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ public:
bool fallBackToDAGISel(const Instruction &Inst) const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
@@ -437,15 +437,12 @@ public:
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
-
- bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const override;
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveValues) const override;
bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index ddfacd9..38cc0ce 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -14,6 +14,7 @@
#include "RISCVISelLowering.h"
#include "RISCVSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -68,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
Intrinsic::riscv_vlseg8_mask};
+static const Intrinsic::ID FixedVssegIntrIds[] = {
+ Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
+ Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
+ Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
+ Intrinsic::riscv_seg8_store_mask};
+
+static const Intrinsic::ID ScalableVssegIntrIds[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask};
+
+static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
+ assert(N);
+ if (N == 1)
+ return true;
+
+ using namespace PatternMatch;
+ // Right now we're only recognizing the simplest pattern.
+ uint64_t C;
+ if (match(V, m_CombineOr(m_ConstantInt(C),
+ m_NUWMul(m_Value(), m_ConstantInt(C)))) &&
+ C && C % N == 0)
+ return true;
+
+ if (isPowerOf2_32(N)) {
+ KnownBits KB = llvm::computeKnownBits(V, DL);
+ return KB.countMinTrailingZeros() >= Log2_32(N);
+ }
+
+ return false;
+}
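
Note the pattern uses m_NUWMul where the old copy of this helper (removed
further down in this diff) accepted any multiply via m_c_Mul: once an
unsigned multiply wraps, its result need not remain a multiple of the
constant factor. A small self-contained demonstration:

#include <cstdint>
#include <cstdio>

int main() {
  // 0x2AAAAAAAAAAAAAAB * 6 overflows 64 bits and wraps to 2, which is not a
  // multiple of 6 (or even of 3), so only a nuw multiply justifies treating
  // "x * 6" as a multiple of 6.
  uint64_t X = 0x2AAAAAAAAAAAAAABull;
  uint64_t P = X * 6; // well-defined wrap modulo 2^64
  std::printf("P = %llu, P %% 6 = %llu\n", (unsigned long long)P,
              (unsigned long long)(P % 6));
  return 0;
}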
+
/// Lower an interleaved load into a vlsegN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -81,21 +115,49 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Indices.size() == Shuffles.size());
- IRBuilder<> Builder(LI);
-
- const DataLayout &DL = LI->getDataLayout();
+ IRBuilder<> Builder(Load);
+ const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
- if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
- LI->getPointerAddressSpace(), DL))
- return false;
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (auto *LI = dyn_cast<LoadInst>(Load)) {
+ assert(LI->isSimple());
+ Ptr = LI->getPointerOperand();
+ Alignment = LI->getAlign();
+ assert(!Mask && "Unexpected mask on a load\n");
+ Mask = Builder.getAllOnesMask(VTy->getElementCount());
+ VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
+ } else {
+ auto *VPLoad = cast<VPIntrinsic>(Load);
+ assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
+ "Unexpected intrinsic");
+ Ptr = VPLoad->getMemoryPointerParam();
+ Alignment = VPLoad->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
- auto *PtrTy = LI->getPointerOperandType();
- auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+ assert(Mask && "vp.load needs a mask!");
+
+ Value *WideEVL = VPLoad->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, DL, Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ }
+
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ return false;
// If the segment load is going to be performed segment at a time anyways
// and there's only one element used, use a strided load instead. This
@@ -104,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
- Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
-
+ Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+  // Note: Same VL as above, but i32 rather than XLen, to match the
+  // signature of vp.strided.load.
+ VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+ VTy->getElementCount());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
- CI->addParamAttr(
- 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+ CI->addParamAttr(0,
+ Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
return true;
};
- Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
CallInst *VlsegN = Builder.CreateIntrinsic(
- FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
- {LI->getPointerOperand(), Mask, VL});
+ FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
for (unsigned i = 0; i < Shuffles.size(); i++) {
Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -133,18 +192,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
return true;
}
-static const Intrinsic::ID FixedVssegIntrIds[] = {
- Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
- Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
- Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
- Intrinsic::riscv_seg8_store_mask};
-
-static const Intrinsic::ID ScalableVssegIntrIds[] = {
- Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
- Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
- Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
- Intrinsic::riscv_vsseg8_mask};
-
/// Lower an interleaved store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
@@ -234,39 +281,15 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
- assert(N);
- if (N == 1)
- return true;
-
- using namespace PatternMatch;
- // Right now we're only recognizing the simplest pattern.
- uint64_t C;
- if (match(V, m_CombineOr(m_ConstantInt(C),
- m_c_Mul(m_Value(), m_ConstantInt(C)))) &&
- C && C % N == 0)
- return true;
-
- if (isPowerOf2_32(N)) {
- KnownBits KB = llvm::computeKnownBits(V, DL);
- return KB.countMinTrailingZeros() >= Log2_32(N);
- }
-
- return false;
-}
-
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveValues) const {
- const unsigned Factor = DeinterleaveValues.size();
+ Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor > 8)
return false;
IRBuilder<> Builder(Load);
- Value *FirstActive =
- *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
- VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+ VectorType *ResVTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = Load->getDataLayout();
auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
@@ -298,10 +321,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
return false;
- VL = Builder.CreateZExt(
- Builder.CreateUDiv(WideEVL,
- ConstantInt::get(WideEVL->getType(), Factor)),
- XLenTy);
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
}
Type *PtrTy = Ptr->getType();
@@ -346,61 +367,74 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
}
- for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
- if (!DIV)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIV->replaceAllUsesWith(NewEV);
- }
-
+ DI->replaceAllUsesWith(Return);
return true;
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const {
+ Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const {
unsigned Factor = InterleaveValues.size();
if (Factor > 8)
return false;
- assert(SI->isSimple());
- IRBuilder<> Builder(SI);
+ IRBuilder<> Builder(Store);
auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
- auto *PtrTy = SI->getPointerOperandType();
- const DataLayout &DL = SI->getDataLayout();
+ const DataLayout &DL = Store->getDataLayout();
+ Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
- SI->getPointerAddressSpace(), DL))
- return false;
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (auto *SI = dyn_cast<StoreInst>(Store)) {
+ assert(SI->isSimple());
+ Ptr = SI->getPointerOperand();
+ Alignment = SI->getAlign();
+ assert(!Mask && "Unexpected mask on a store");
+ Mask = Builder.getAllOnesMask(InVTy->getElementCount());
+ VL = isa<FixedVectorType>(InVTy)
+ ? Builder.CreateElementCount(XLenTy, InVTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ } else {
+ auto *VPStore = cast<VPIntrinsic>(Store);
+ assert(VPStore->getIntrinsicID() == Intrinsic::vp_store &&
+ "Unexpected intrinsic");
+ Ptr = VPStore->getMemoryPointerParam();
+ Alignment = VPStore->getPointerAlignment().value_or(
+ DL.getABITypeAlign(InVTy->getElementType()));
+
+ assert(Mask && "vp.store needs a mask!");
- Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+ Value *WideEVL = VPStore->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, DL, Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ }
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL))
+ return false;
if (isa<FixedVectorType>(InVTy)) {
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy});
-
+ Store->getModule(), FixedVssegIntrIds[Factor - 2],
+ {InVTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops(InterleaveValues);
- Value *VL = Builder.CreateElementCount(XLenTy, InVTy->getElementCount());
- Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount());
- Ops.append({SI->getPointerOperand(), Mask, VL});
-
+ Ops.append({Ptr, Mask, VL});
Builder.CreateCall(VssegNFunc, Ops);
return true;
}
unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
- SI->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(SI->getContext()),
+ Store->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
NumElts * SEW / 8),
Factor);
- Value *VL = Constant::getAllOnesValue(XLenTy);
- Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount());
-
Value *StoredVal = PoisonValue::get(VecTupTy);
for (unsigned i = 0; i < Factor; ++i)
StoredVal = Builder.CreateIntrinsic(
@@ -408,131 +442,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
{StoredVal, InterleaveValues[i], Builder.getInt32(i)});
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), ScalableVssegIntrIds[Factor - 2],
+ Store->getModule(), ScalableVssegIntrIds[Factor - 2],
{VecTupTy, PtrTy, Mask->getType(), VL->getType()});
- Value *Operands[] = {StoredVal, SI->getPointerOperand(), Mask, VL,
+ Value *Operands[] = {StoredVal, Ptr, Mask, VL,
ConstantInt::get(XLenTy, Log2_64(SEW))};
Builder.CreateCall(VssegNFunc, Operands);
return true;
}
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-/// %mask,
-/// i32 %wide.rvl)
-/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.vector.deinterleave2.nxv64i8(
-/// <vscale x 64 x i8> %l)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-/// %rvl = udiv %wide.rvl, 2
-/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-/// <vscale x 32 x i8> undef,
-/// ptr %ptr,
-/// %mask,
-/// i64 %rvl,
-/// i64 1)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
- VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveResults) const {
- const unsigned Factor = DeinterleaveResults.size();
- assert(Mask && "Expect a valid mask");
- assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
- "Unexpected intrinsic");
-
- Value *FirstActive = *llvm::find_if(DeinterleaveResults,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
- auto &DL = Load->getModule()->getDataLayout();
- Align Alignment = Load->getParamAlign(0).value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- if (!isLegalInterleavedAccessType(
- VTy, Factor, Alignment,
- Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(Load);
-
- Value *WideEVL = Load->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
- return false;
-
- auto *PtrTy = Load->getArgOperand(0)->getType();
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
- Value *EVL = Builder.CreateZExt(
- Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
- XLenTy);
-
- Value *Return = nullptr;
- if (isa<FixedVectorType>(VTy)) {
- Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
- {VTy, PtrTy, XLenTy},
- {Load->getArgOperand(0), Mask, EVL});
- } else {
- unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
- unsigned NumElts = VTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- Load->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
- Value *Operands[] = {
- PoisonValue::get(VecTupTy),
- Load->getArgOperand(0),
- Mask,
- EVL,
- ConstantInt::get(XLenTy,
- RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
- ConstantInt::get(XLenTy, Log2_64(SEW))};
-
- CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
- SmallVector<Type *, 8> AggrTypes{Factor, VTy};
- Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
- Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
- for (unsigned i = 0; i < Factor; ++i) {
- Value *VecExtract =
- Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
- Return = Builder.CreateInsertValue(Return, VecExtract, i);
- }
- }
-
- for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
- if (!DIO)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
- }
-
- return true;
-}
-
/// Lower an interleaved vp.store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved vp.store (Factor = 2):
@@ -583,9 +501,9 @@ bool RISCVTargetLowering::lowerInterleavedVPStore(
auto *PtrTy = Store->getArgOperand(1)->getType();
auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- Value *EVL = Builder.CreateZExt(
- Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
- XLenTy);
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ Value *EVL =
+ Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
if (isa<FixedVectorType>(VTy)) {
SmallVector<Value *, 8> Operands(InterleaveOperands);
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 05388f2..3e286a7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -13,6 +13,17 @@
//
//===----------------------------------------------------------------------===//
+class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
+}
+
+class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ int SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -44,6 +55,19 @@ let BufferSize = 0 in {
// floating point instructions, this model assumes single issue as
// increasing it reduces the gains we saw in performance
def SMX60_FP : ProcResource<1>;
+
+ // Vector pipeline
+ // Single issue for vector store/load instructions
+ def SMX60_VLS : ProcResource<1>;
+
+ // The C908 user manual says: "Vector floating-point units support vector
+ // floating-point computation of different bits. In addition, vector integer
+ // units are added". Developer confirmed it's a separate VIEU
+ def SMX60_VIEU : ProcResource<1>;
+
+ // The C908 user manual says: "The vector execution unit is developed by
+  // extending the floating-point unit", so let's assume single issue for now.
+ def SMX60_VFP : ProcResource<1>;
}
//===----------------------------------------------------------------------===//
@@ -232,9 +256,341 @@ let Latency = 4 in {
def : WriteRes<WriteFMovI32ToF32, [SMX60_IEU]>;
}
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [SMX60_IEUA]>;
+def : WriteRes<WriteVSETIVLI, [SMX60_IEUA]>;
+def : WriteRes<WriteVSETVL, [SMX60_IEUA]>;
+
+// 7. Vector Loads and Stores
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ // Unit-stride loads and stores
+ defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>;
+
+ // Mask loads and stores
+ defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
+ defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
+
+ // Strided and indexed loads and stores
+ foreach eew = [8, 16, 32, 64] in {
+ defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
+}
+
+// Segmented loads and stores
+foreach mx = SchedMxList in {
+  foreach nf = 2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ // Unit-stride segmented
+ defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Strided/indexed segmented
+ defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Indexed segmented
+ defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
+ }
+}
+
+// Whole register move/load/store
+foreach LMul = [1, 2, 4, 8] in {
+ def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>;
+ def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>;
+
+ def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>;
+}
+
+// 11. Vector Integer Arithmetic Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// Vector Integer Division and Remainder
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+// Narrowing Shift and Clips
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 13. Vector Floating-Point Instructions
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListFW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// Narrowing
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+    defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// Vector Floating-Point Division and Square Root
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// 14. Vector Reduction Operations
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListWRed in {
+ foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListFWRed in {
+ foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// 15. Vector Mask Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 16. Vector Permutation Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+
+def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+
+// Gather and Compress
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
// Others
def : WriteRes<WriteCSR, [SMX60_IEU]>;
def : WriteRes<WriteNop, [SMX60_IEU]>;
+def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -341,10 +697,184 @@ def : ReadAdvance<ReadCLMUL, 0>;
def : ReadAdvance<ReadSingleBit, 0>;
def : ReadAdvance<ReadSingleBitImm, 0>;
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 11. Vector Integer Arithmetic Instructions
+defm : LMULReadAdvance<"ReadVIALUV", 0>;
+defm : LMULReadAdvance<"ReadVIALUX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+defm : LMULReadAdvance<"ReadVExtV", 0>;
+defm : LMULReadAdvance<"ReadVICALUV", 0>;
+defm : LMULReadAdvance<"ReadVICALUX", 0>;
+defm : LMULReadAdvance<"ReadVShiftV", 0>;
+defm : LMULReadAdvance<"ReadVShiftX", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+defm : LMULReadAdvance<"ReadVICmpV", 0>;
+defm : LMULReadAdvance<"ReadVICmpX", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+defm : LMULReadAdvance<"ReadVIMulV", 0>;
+defm : LMULReadAdvance<"ReadVIMulX", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+defm : LMULReadAdvance<"ReadVIMovV", 0>;
+defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+// 13. Vector Floating-Point Instructions
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 14. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 15. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 16. Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
+foreach mx = SchedMxList in {
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
+ foreach sew = SchedSEWSet<mx>.val in
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
+}
+
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbkb;
defm : UnsupportedSchedZbkx;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index c2b5e01..e656e8b 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
return TwoTimes ? MILog2SEW + 1 : MILog2SEW;
}
+ // Vector Register Gather with 16-bit Index Elements Instruction
+ // Dest and source data EEW=SEW. Index vector EEW=16.
+ case RISCV::VRGATHEREI16_VV: {
+ if (MO.getOperandNo() == 2)
+ return 4;
+ return MILog2SEW;
+ }
+
default:
return std::nullopt;
}
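
The EEW rule this case encodes is simple enough to state standalone. The sketch below is illustrative only: the helper name and the bare operand-number check are ours, mirroring the case above, where the operand at position 2 carries the 16-bit index vector while the destination and data source track the instruction's SEW.

```cpp
#include <cassert>

// Illustrative helper (not the pass's API): log2 of the effective element
// width (EEW) for each vrgatherei16.vv operand, given log2(SEW). Mirroring
// the case above, operand 2 is the 16-bit index vector, so its EEW is fixed
// at 16 (log2 == 4) regardless of SEW.
static unsigned log2EEWForVRGatherEI16(unsigned OperandNo, unsigned Log2SEW) {
  if (OperandNo == 2)
    return 4; // log2(16): index elements are always 16 bits wide
  return Log2SEW; // destination and data source use the instruction's SEW
}

int main() {
  assert(log2EEWForVRGatherEI16(2, 6) == 4); // indices stay e16 even at SEW=64
  assert(log2EEWForVRGatherEI16(0, 6) == 6); // destination uses e64
  return 0;
}
```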
@@ -1058,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VSLIDEDOWN_VI:
case RISCV::VSLIDE1UP_VX:
case RISCV::VFSLIDE1UP_VF:
+ // Vector Register Gather Instructions
+ case RISCV::VRGATHER_VI:
+ case RISCV::VRGATHER_VV:
+ case RISCV::VRGATHER_VX:
+ case RISCV::VRGATHEREI16_VV:
// Vector Single-Width Floating-Point Add/Subtract Instructions
case RISCV::VFADD_VF:
case RISCV::VFADD_VV:
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6897865..ea78dcd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro
defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>;
defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>;
defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>;
-defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>;
// GetQuery builtin records:
defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>;
@@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>;
defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>;
defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>;
defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fd0bea0..6608b3f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract);
case Intrinsic::spv_normalize:
return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize);
+ case Intrinsic::spv_refract:
+ return selectExtInst(ResVReg, ResType, I, GL::Refract);
case Intrinsic::spv_reflect:
return selectExtInst(ResVReg, ResType, I, GL::Reflect);
case Intrinsic::spv_rsqrt:
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f..82e8ce4e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ assert(ValVT == MVT::i32 && "Should have i32 parts");
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+ int64_t Offset = State.AllocateStack(16, Align(16));
+ PendingMembers[0].convertToMem(Offset);
+ PendingMembers[1].convertToMem(Offset + 4);
+ PendingMembers[2].convertToMem(Offset + 8);
+ PendingMembers[3].convertToMem(Offset + 12);
+
+ State.addLoc(PendingMembers[0]);
+ State.addLoc(PendingMembers[1]);
+ State.addLoc(PendingMembers[2]);
+ State.addLoc(PendingMembers[3]);
+ PendingMembers.clear();
+ return true;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa..f020e0b 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
+ // i128 and fp128 need to be passed on the stack with a higher alignment than
+ // their legal types. Handle this with a custom function.
+ CCIfType<[i32],
+ CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
// On swifttailcc pass swiftself in ECX.
CCIfCC<"CallingConv::SwiftTail",
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a3..2636979 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad3553..b4639ac 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg,
const DataLayout &DL) const {
- // i128 split into i64 needs to be allocated to two consecutive registers,
- // or spilled to the stack as a whole.
- return Ty->isIntegerTy(128);
+ // On x86-64 i128 is split into two i64s and needs to be allocated to two
+ // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+  // is split into four i32s and never actually passed in registers, but we use
+ // the consecutive register mark to match it in TableGen.
+ if (Ty->isIntegerTy(128))
+ return true;
+
+ // On x86-32, fp128 acts the same as i128.
+ if (Subtarget.is32Bit() && Ty->isFP128Ty())
+ return true;
+
+ return false;
}
/// Helper for getByValTypeAlignment to determine
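
The predicate can be restated as a small decision table. The sketch below is illustrative only: a free function over boolean flags, not the real `TargetLowering` hook with its `Type` and subtarget queries.

```cpp
#include <cassert>

// Sketch of the predicate's decision table (hypothetical helper):
// consecutive-register marking is needed for i128 on both targets, and
// additionally for fp128 only on 32-bit x86.
static bool needsConsecutiveRegs(bool IsInt128, bool IsFP128, bool Is32Bit) {
  if (IsInt128)
    return true; // x86-64: two i64 halves; x86-32: four i32 parts
  if (Is32Bit && IsFP128)
    return true; // fp128 follows the i128 rule only on x86-32
  return false;
}

int main() {
  assert(needsConsecutiveRegs(true, false, false));  // i128 on x86-64
  assert(needsConsecutiveRegs(false, true, true));   // fp128 on x86-32
  assert(!needsConsecutiveRegs(false, true, false)); // fp128 on x86-64 (XMM)
  return 0;
}
```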
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3..360293bc 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
// Create an interleaved access group.
IRBuilder<> Builder(LI);
X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
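
Finally, a toy model of the new guard, using simplified stand-in classes rather than LLVM's IR types: the hook now receives any load-like instruction plus an optional mask value, and the X86 path proceeds only for a plain unmasked `LoadInst`, returning false so other loads fall back to the generic lowering.

```cpp
#include <cassert>

// Simplified stand-ins for the IR class hierarchy (illustrative only).
struct Instruction { virtual ~Instruction() = default; };
struct LoadInst : Instruction {};   // a plain, unmasked load
struct VPLoadInst : Instruction {}; // e.g. a predicated/masked load variant
struct Value {};

// Sketch of the guard added above; dynamic_cast stands in for dyn_cast.
static bool lowerInterleavedLoadX86(Instruction *Load, Value *Mask) {
  auto *LI = dynamic_cast<LoadInst *>(Load);
  if (!LI)
    return false; // not a plain load: let generic lowering handle it
  assert(!Mask && "Unexpected mask on a load");
  // ... the existing interleaved-access rewrite would proceed from here ...
  return true;
}

int main() {
  LoadInst Plain;
  VPLoadInst Masked;
  assert(lowerInterleavedLoadX86(&Plain, nullptr));   // handled here
  assert(!lowerInterleavedLoadX86(&Masked, nullptr)); // declined
  return 0;
}
```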