Diffstat (limited to 'llvm/lib/Target')
49 files changed, 2239 insertions, 858 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 7de66cc..12fc976 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( llvm_unreachable("Unsupported ElementSize"); } + // Preserve undef state until DOP's reg is defined. + unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0; + // // Create the destructive operation (if required) // @@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; + DOPRegState = 0; // Create the additional LSL to zero the lanes when the DstReg is not // unique. Zeros the lanes in z0 that aren't active in p0 with sequence @@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DOPRegIsUnique && "The destructive operand should be unique"); PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); DOPIdx = 0; + DOPRegState = 0; } // @@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( // DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + DOPRegState = DOPRegState | RegState::Kill; switch (DType) { case AArch64::DestructiveUnaryPassthru: - DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(PredIdx)) .add(MI.getOperand(SrcIdx)); break; @@ -659,12 +665,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) - .add(MI.getOperand(SrcIdx)); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)); break; case AArch64::DestructiveTernaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4f13a14..d04e6c4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of 
shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17486,9 +17491,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; @@ -17498,9 +17502,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( return false; assert(!Mask && "Unexpected mask on a load\n"); - Value *FirstActive = *llvm::find_if(DeinterleavedValues, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); + VectorType *VTy = getDeinterleavedVectorType(DI); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17528,6 +17530,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); + Value *Result = nullptr; if (NumLoads > 1) { // Create multiple legal small ldN. SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy)); @@ -17548,35 +17551,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) { - if (DeinterleavedValues[J]) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); - } + + // Merge the values from different factors. 
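A minimal sketch of what the merge loop just below computes, assuming Factor == 2 and writing A and B for ExtractedLdValues[0] and ExtractedLdValues[1] (the IRBuilder calls are the same ones the patch uses):

    Value *R = PoisonValue::get(DI->getType()); // the {VTy, VTy} aggregate the intrinsic returns
    R = Builder.CreateInsertValue(R, A, 0);     // lanes of factor 0
    R = Builder.CreateInsertValue(R, B, 1);     // lanes of factor 1
    DI->replaceAllUsesWith(R);

Rebuilding the whole aggregate lets the multi-load path and the single-ldN path funnel into one replaceAllUsesWith, instead of patching each extractvalue user of the deinterleave intrinsic individually.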
+ Result = PoisonValue::get(DI->getType()); + for (unsigned J = 0; J < Factor; ++J) + Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); } else { - Value *Result; if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned I = 0; I < Factor; I++) { - if (DeinterleavedValues[I]) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); - } - } } + + // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 + DI->replaceAllUsesWith(Result); return true; } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleavedValues) const { + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleavedValues) const { unsigned Factor = InterleavedValues.size(); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } + StoreInst *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!Mask && "Unexpected mask on plain store"); VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType()); const DataLayout &DL = SI->getModule()->getDataLayout(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6afb3c3..713793e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,19 +211,19 @@ public: unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 996b0ed..bc57537 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case 
AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have loads into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non-debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of a 128-bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes.
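A rough sketch of the MIR shape being matched, assuming NumLanes == 4 (register names are invented for illustration; the operand order follows the getOperand(1)/(2)/(3) accesses above):

    %v0:fpr128 = SUBREG_TO_REG 0, %s0:fpr32, %subreg.ssub   ; lane 0 comes from an integer-sized load
    %v1:fpr128 = LD1i32 %v0, 1, %addr1
    %v2:fpr128 = LD1i32 %v1, 2, %addr2
    %v3:fpr128 = LD1i32 %v2, 3, %addr3                      ; pattern root: loads the last lane

Every LD1i32 reads the vector produced by the previous one, so the loads form a single serial dependency chain; the rewrite below splits it into two half-width chains.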
+ switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. + llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. 
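A side note on why one scalar load suffices here (architectural behavior, not stated in the patch): AArch64 scalar FP/SIMD loads write the low bits and zero the remainder of the 128-bit register, so for NumLanes == 4 the second chain can start as (hypothetical registers)

    %mid:fpr32 = LDRSui %addr, 0                  ; middle lane; upper 96 bits implicitly zeroed
    %r1:fpr128 = SUBREG_TO_REG 0, %mid, %subreg.ssub

with no extra instruction needed to clear the unused lanes.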
+ auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. + auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. 
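Two hedged notes on the code above and the ZIP that follows. First, the loop over LanesToLoadToReg1 breaks before pushing its final instruction onto DelInstrs because that instruction is Root itself, and the machine combiner already erases Root when it applies InsInstrs. Second, ZIP1v2i64 takes doubleword 0 from each source, so the combine amounts to:

    result[63:0]   = LastLoadReg0[63:0]    ; original lanes 0 .. NumLanes/2-1
    result[127:64] = LastLoadReg1[63:0]    ; original lanes NumLanes/2 .. NumLanes-1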
+ MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da..02734866 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 233f42b..08f547a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -559,8 +559,7 @@ void AArch64TargetELFStreamer::finish() { if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); - (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), - *Ctx.getSubtargetInfo()); + S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b2b2b37..0e0e83b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -1112,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + 
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1954,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -1972,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2381,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -2693,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 280f87b..3d040fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. +static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? 
ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. + // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebf..9a90787 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ public: // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. + DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. 
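A hedged illustration of the effect of the check below: classifying an XDL WMMA op as TRANS means the s_delay_alu emitted for its consumer uses a TRANS-style dependency, along the lines of

    v_wmma_f32_16x16x16_f16 v[0:3], ...
    s_delay_alu instid0(TRANS32_DEP_1)
    v_add_f32 v4, v0, v1

rather than a VALU_DEP encoding; the mnemonics and registers here are illustrative only.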
+ if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ public: continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ public: LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a4..31a80e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2284,6 +2284,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3c62cd1..3b2f39c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -183,6 +183,7 @@ public: void addPreEmitPass(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9..c8a4e22 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = !if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode. 
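A hedged sketch of what the new flat GVS (saddr) forms look like at the assembly level, following the operand strings added later in this file: the SGPR pair supplies the 64-bit base and the VGPR a 32-bit offset (the concrete registers and offset are made up):

    flat_load_b32  v1, v0, s[2:3] offset:16
    flat_store_b32 v0, v2, s[2:3]

This mirrors the existing global_* SADDR addressing, which is why the GlobalSaddrTable class below now covers both.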
class GlobalSaddrTable <bit is_saddr, string Name = ""> { bit IsSaddr = is_saddr; string SaddrOp = Name; @@ -237,10 +238,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16<string opName> { - def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>; +multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; + defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, @@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = EnableSaddr; } -multiclass FLAT_Store_Pseudo_t16<string opName> { - def "" : FLAT_Store_Pseudo<opName, VGPR_32>; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>; +multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { + def "" : FLAT_Store_Pseudo<opName, regClass>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Store_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_D16_HI", NAME>; + def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; + } } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, + RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // 
Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; -def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; 
+defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp (inst $saddr, $voffset, $offset, 0, $in) >; +class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt> : GCNPat < +class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, + ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; @@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp let AddedComplexity = 10; } - def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string 
inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat <inst, node, vt>; + + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16_t16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <inst, node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; +defm : FlatLoadPats 
<FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>; def : FlatStorePat <FLAT_STORE_BYTE_t16, 
atomic_store_8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>; -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>; foreach vt = Reg32Types.types in { -def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; -def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; +defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; -def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; } let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, 
az_extloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>; // appropriate waits. 
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>; +defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; @@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op, VFLAT_Aliases_gfx12<name, alias>, VFLAT_Real_gfx12<op, name>; -multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, - string name = get_FLAT_ps<NAME>.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12<op, name, alias> { - defm _RTN : VFLAT_Real_gfx12<op, name>; -} - -multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : VFLAT_Real_Base_gfx12<op, name, alias> { @@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Aliases_gfx12<name> { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, } } -multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, +multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12<op, name, alias> { + VFLAT_Real_AllAddr_gfx12<op, name, alias> { defm _RTN : VFLAT_Real_gfx12<op, name>; defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>; } @@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, } // ENC_VFLAT. 
-defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : 
VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. -defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, 
"global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, 
"global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm 
GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36..a655308 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6843052..268162b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ protected: bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -233,6 +234,7 @@ protected: bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1160,6 +1162,8 @@ public: bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1377,6 +1381,8 @@ public: return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a463..44d9ef5 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65..27212fda 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9308,7 +9308,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359..2af0a57 100644 --- 
a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. -class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool 
hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. - return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<const MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. 
-  int VgprUB = -1;
-  int SgprUB = -1;
-  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
-  // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
-  // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
-  // X_CNT score.
-  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
-  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
-  // write to each vgpr.
-  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
-  // Store representative LDS DMA operations. The only useful info here is
-  // alias info. One store is kept per unique AAInfo.
-  SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
-};
+class WaitcntBrackets;

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
@@ -640,8 +407,13 @@ public:
};

class SIInsertWaitcnts {
+public:
+  const GCNSubtarget *ST;
+  InstCounterType SmemAccessCounter;
+  InstCounterType MaxCounter;
+  const unsigned *WaitEventMaskForInst;
+
private:
-  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
@@ -657,8 +429,6 @@ private:
    bool Dirty = true;
  };

-  InstCounterType SmemAccessCounter;
-
  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -675,7 +445,7 @@ private:
  // message.
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

-  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+  HardwareLimits Limits;

public:
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -686,6 +456,30 @@ public:
    (void)ForceVMCounter;
  }

+  unsigned getWaitCountMax(InstCounterType T) const {
+    switch (T) {
+    case LOAD_CNT:
+      return Limits.LoadcntMax;
+    case DS_CNT:
+      return Limits.DscntMax;
+    case EXP_CNT:
+      return Limits.ExpcntMax;
+    case STORE_CNT:
+      return Limits.StorecntMax;
+    case SAMPLE_CNT:
+      return Limits.SamplecntMax;
+    case BVH_CNT:
+      return Limits.BvhcntMax;
+    case KM_CNT:
+      return Limits.KmcntMax;
+    case X_CNT:
+      return Limits.XcntMax;
+    default:
+      break;
+    }
+    return 0;
+  }
+
  bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          const WaitcntBrackets &ScoreBrackets);
@@ -791,6 +585,211 @@ public:
                              WaitcntBrackets &ScoreBrackets);
};

+// This object maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of event happen in the bracket,
+// wait count may get decreased out of order; therefore we need to put in
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
+
+  bool isSmemCounter(InstCounterType T) const {
+    return T == Context->SmemAccessCounter || T == X_CNT;
+  }
+
+  unsigned getSgprScoresIdx(InstCounterType T) const {
+    assert(isSmemCounter(T) && "Invalid SMEM counter");
+    return T == X_CNT ?
1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. 
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. 
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a1e14d9..9da8a1c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6460,7 +6460,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6484,7 +6484,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; @@ -6537,7 +6537,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. 
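// Editor's aside on the SIInsertWaitcnts changes above (annotation, not part
// of the patch): the restructuring replaces several per-bracket copies of
// shared, read-only state (subtarget, hardware limits, event masks, counter
// kinds) with a single back pointer to the pass object, which is why
// WaitcntBrackets is now only forward-declared before the pass class and
// defined after it. A minimal self-contained C++ sketch of the two idioms
// involved, with hypothetical names:

#include <memory>
#include <new>

class Pass; // forward declaration: the bracket type can be defined first

class Brackets {
public:
  explicit Brackets(const Pass *Context) : Context(Context) {}
  unsigned waitCountMax(unsigned T) const; // needs the complete Pass type

private:
  const Pass *Context;           // one shared copy of the pass state
  unsigned Scores[8][1024] = {}; // large per-bracket scoreboard
};

class Pass {
public:
  unsigned Limits[8] = {};
};

inline unsigned Brackets::waitCountMax(unsigned T) const {
  return Context->Limits[T];
}

// Reinitializing a large object in place, as run() does above: destroy, then
// placement-new into the same storage, so no big temporary lands on the stack.
void reinit(std::unique_ptr<Brackets> &B, const Pass *Context) {
  if (!B) {
    B = std::make_unique<Brackets>(Context);
  } else {
    B->~Brackets();                  // end the old object's lifetime
    new (B.get()) Brackets(Context); // construct anew in the same storage
  }
}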
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence @@ -10466,10 +10466,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a380199..3a48e65 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -867,6 +867,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd9..5097ac03 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast<const SITargetLowering *>(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9df2bde..7725881 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? 
Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a..c9d2c28 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e2f3710..8c35fea 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -531,6 +534,11 @@ let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1133,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; +defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; @@ -1141,6 +1150,11 @@ defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : 
VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 65d1c4e..fd3b052 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); auto T = const_cast<Type*>(CP->getType()); auto C = const_cast<Constant*>(CP->getConstVal()); - auto M = const_cast<Module*>(DAG.getMachineFunction(). - getFunction().getParent()); + auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + @@ -21585,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21593,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5f4aef5..9159f3d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -681,7 +681,7 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index ce43645..f0e2e78 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { Info.RootFlattenedArrayType, Info.RootPointerOperand, {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags()); + // If the pointer operand is a global variable and all indices are 0, + // IRBuilder::CreateGEP will return the global variable instead of creating + // a GEP instruction or GEP ConstantExpr. In this case we have to create and + // insert our own GEP instruction. + if (!isa<GEPOperator>(NewGEP)) + NewGEP = GetElementPtrInst::Create( + Info.RootFlattenedArrayType, Info.RootPointerOperand, + {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(), + Builder.GetInsertPoint()); + // Replace the current GEP with the new GEP. 
Store GEPInfo into the map
  // for later use in case this GEP was not the end of the chain
  GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c9ff713..c73648f 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}

static void
-legalizeLoadStoreOnArrayAllocas(Instruction &I,
+legalizeScalarLoadStoreOnArrays(Instruction &I,
                                SmallVectorImpl<Instruction *> &ToRemove,
                                DenseMap<Value *, Value *> &) {
@@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I,
  } else
    return;

-  assert(LoadStoreTy->isSingleValueType() &&
-         "Expected load/store type to be a single-valued type");
+  // If the load/store is not of a single-value type (i.e., scalar or vector),
+  // then we do not modify it. It shouldn't be a vector either, because the
+  // dxil-data-scalarization pass is expected to run before this, but it's not
+  // incorrect to apply this transformation to vector load/stores.
+  if (!LoadStoreTy->isSingleValueType())
+    return;

-  auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp);
-  if (!AllocaPtrOp)
+  Type *ArrayTy;
+  if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
+    ArrayTy = GlobalVarPtrOp->getValueType();
+  else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
+    ArrayTy = AllocaPtrOp->getAllocatedType();
+  else
    return;

-  Type *Ty = AllocaPtrOp->getAllocatedType();
-  if (!isa<ArrayType>(Ty))
+  if (!isa<ArrayType>(ArrayTy))
    return;

-  assert(!isa<ArrayType>(Ty->getArrayElementType()) &&
-         "Expected allocated type of AllocaInst to be a flat ArrayType");
-  IRBuilder<> Builder(&I);
-  Value *Zero = Builder.getInt32(0);
-  Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "",
-                                 GEPNoWrapFlags::all());
+  assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+         "Expected array element type to be the same as the scalar load or "
+         "store type");
+
+  Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+  Value *GEP = GetElementPtrInst::Create(
+      ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
  I.setOperand(PtrOpIndex, GEP);
}

@@ -651,7 +659,7 @@ private:
    // downcastI64toI32InsertExtractElements needs to handle.
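// Editor's aside on legalizeScalarLoadStoreOnArrays above (annotation, not
// part of the patch): the rewrite gives a scalar load/store whose pointer
// operand is directly an array alloca or array global an explicit zero-index
// GEP. Roughly, in hypothetical IR:
//
//   %v = load float, ptr @g              ; @g has type [4 x float]
//
// becomes
//
//   %gep = getelementptr inbounds [4 x float], ptr @g, i32 0, i32 0
//   %v = load float, ptr %gep
//
// GetElementPtrInst::Create is used rather than IRBuilder::CreateGEP,
// presumably for the reason documented in the DXILFlattenArrays hunk above:
// the builder constant-folds an all-zero-index GEP on a global back to the
// global itself, and a real GEP instruction is wanted here.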
LegalizationPipeline[Stage2].push_back( downcastI64toI32InsertExtractElements); - LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas); + LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays); } }; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 53943de3..e285e04 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { R = N; break; } + case ISD::AssertSext: { + EVT T = cast<VTSDNode>(N.getOperand(1))->getVT(); + if (T.getSizeInBits() == 32) + R = N.getOperand(0); + else + return false; + break; + } + default: return false; } diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d5a5f17..36c3011 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the baisc single-precision floating-point instructions. +// This file describes the basic single-precision floating-point instructions. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index c47987f..2378664 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VecTy = Op->getOperand(0)->getValueType(0); SDValue Idx = Op->getOperand(1); - EVT EltTy = VecTy.getVectorElementType(); unsigned NumElts = VecTy.getVectorNumElements(); - if (isa<ConstantSDNode>(Idx) && - (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) + if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) return Op; return SDValue(); @@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 95e9fd4..a0107e4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>; } +multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 4))), + !add(imm2, 4)), + (XVEXTRINS_W $xd, $xj, Imm)>; + } + } +} + +multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 
0...1 in {
+      defvar Imm = !or(!shl(imm2, 4), imm1);
+      def : Pat<(vector_insert (vector_insert vecty:$xd,
+                    (elemty (vector_extract vecty:$xj, imm1)), imm2),
+                    (elemty (vector_extract vecty:$xj, !add(imm1, 2))),
+                    !add(imm2, 2)),
+                (XVEXTRINS_D $xd, $xj, Imm)>;
+    }
+  }
+}
+
let Predicates = [HasExtLASX] in {
// XVADD_{B/H/W/D}
@@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">;
defm : PatCCXrXrF<SETO,  "XVFCMP_COR">;
defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">;

+// Insert two elements extracted from vector into vector. (The two elements
+// must occupy the same relative positions within the low and high 128-bit
+// halves of the source and destination vectors.)
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D}
+// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{W/D}
+foreach imm1 = 0...15 in {
+  foreach imm2 = 0...15 in {
+    defvar Imm = !or(!shl(imm2, 4), imm1);
+    def : Pat<(vector_insert (vector_insert v32i8:$xd,
+                  (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2),
+                  (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))),
+                  !add(imm2, 16)),
+              (XVEXTRINS_B $xd, $xj, Imm)>;
+  }
+}
+
+foreach imm1 = 0...7 in {
+  foreach imm2 = 0...7 in {
+    defvar Imm = !or(!shl(imm2, 4), imm1);
+    def : Pat<(vector_insert (vector_insert v16i16:$xd,
+                  (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2),
+                  (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))),
+                  !add(imm2, 8)),
+              (XVEXTRINS_H $xd, $xj, Imm)>;
+  }
+}
+
+defm : PairInsertExtractPatV8<v8i32, GRLenVT>;
+defm : PairInsertExtractPatV8<v8f32, f32>;
+defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
+defm : PairInsertExtractPatV4<v4f64, f64>;
+
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
          (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
@@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
-          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
-          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+          (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+          (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+          (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+          (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+          (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+          (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
@@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in {
  def : RegRegStPat<store, XVSTX, LASX256, vt>;
}

+// Bitcast float/double element extracted from vector to integer.
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))), + (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))), + (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>; + // Vector extraction with constant index. +foreach imm = 16...31 in { + defvar Imm = !and(imm, 15); + def : Pat<(i64 (vector_extract v32i8:$xj, imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128), + Imm)>; +} +foreach imm = 8...15 in { + defvar Imm = !and(imm, 7); + def : Pat<(i64 (vector_extract v16i16:$xj, imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128), + Imm)>; +} def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d73d780..962e7c2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst, (Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>; } +multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_W $vd, $vj, Imm)>; + } + } +} + +multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_D $vd, $vj, Imm)>; + } + } +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">; defm : PatCCVrVrF<SETO, "VFCMP_COR">; defm : PatCCVrVrF<SETUO, "VFCMP_CUN">; +// Insert element extracted from vector into vector. 
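+// (Editor's note, annotation only: VEXTRINS packs both lane indices into one
+// immediate byte as Imm = (imm2 << 4) | imm1, with the destination lane in
+// the high nibble and the source lane in the low nibble. For example, copying
+// lane 1 of $vj into lane 0 of $vd uses Imm = (0 << 4) | 1 = 0x01, and lane 2
+// into lane 3 uses Imm = (3 << 4) | 2 = 0x32. The PairInsertExtractPat
+// multiclasses in the LASX file build the same byte for both 128-bit halves.)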
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v16i8:$vd, + (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2), + (VEXTRINS_B $vd, $vj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v8i16:$vd, + (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2), + (VEXTRINS_H $vd, $vj, Imm)>; + } +} + +defm : InsertExtractPatV4<v4i32, GRLenVT>; +defm : InsertExtractPatV4<v4f32, f32>; +defm : InsertExtractPatV2<v2i64, GRLenVT>; +defm : InsertExtractPatV2<v2f64, f64>; + // VINSGR2VR_{B/H/W/D} def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; @@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; - +def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm), + (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), + (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), @@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { def : RegRegStPat<store, VSTX, LSX128, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))), + (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))), + (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>; + // Vector extraction with constant index. def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01e4d17..259b71b 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, TOut.getStreamer().emitRelocDirective( *TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR", - RelocJalrExpr, IDLoc, *STI); + RelocJalrExpr); TOut.getStreamer().emitLabel(TmpLabel); } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index c18ba44..ca03310 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -166,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, OutStreamer.emitRelocDirective( *OffsetExpr, Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + CaleeExpr); OutStreamer.emitLabel(OffsetLabel); return; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d017c65..7aa06f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1048,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); + // Enable custom lowering for the following: + // * MVT::i128 - clusterlaunchcontrol + // * MVT::i32 - prmt + // * MVT::Other - internal.addrspace.wrap + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, + Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, + {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); +} + +static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. @@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); } - return DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return getPRMT(L, R, SelectionValue, DL, DAG); }; auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + return DAG.getBitcast(VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 
@@ -2176,11 +2189,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32, DAG.getZExtOrTrunc(Index, DL, MVT::i32), DAG.getConstant(0x7770, DL, MVT::i32)); - SDValue PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::i32, - {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32), - Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector), + DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG); + SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8); + Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8); + Ext->setFlags(Flags); + return Ext; } // Constant index will be matched by tablegen. @@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDLoc DL(Op); - return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, - DAG.getConstant(Selector, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1), + DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG); + return DAG.getBitcast(Op.getValueType(), PRMT); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift @@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { + const unsigned Mode = [&]() { + switch (Op->getConstantOperandVal(0)) { + case Intrinsic::nvvm_prmt: + return NVPTX::PTXPrmtMode::NONE; + case Intrinsic::nvvm_prmt_b4e: + return NVPTX::PTXPrmtMode::B4E; + case Intrinsic::nvvm_prmt_ecl: + return NVPTX::PTXPrmtMode::ECL; + case Intrinsic::nvvm_prmt_ecr: + return NVPTX::PTXPrmtMode::ECR; + case Intrinsic::nvvm_prmt_f4e: + return NVPTX::PTXPrmtMode::F4E; + case Intrinsic::nvvm_prmt_rc16: + return NVPTX::PTXPrmtMode::RC16; + case Intrinsic::nvvm_prmt_rc8: + return NVPTX::PTXPrmtMode::RC8; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + SDLoc DL(Op); + SDValue A = Op->getOperand(1); + SDValue B = Op.getNumOperands() == 4 ? 
Op.getOperand(2) + : DAG.getConstant(0, DL, MVT::i32); + SDValue Selector = (Op->op_end() - 1)->get(); + return getPRMT(A, B, Selector, DL, DAG, Mode); +} static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; + case Intrinsic::nvvm_prmt: + case Intrinsic::nvvm_prmt_b4e: + case Intrinsic::nvvm_prmt_ecl: + case Intrinsic::nvvm_prmt_ecr: + case Intrinsic::nvvm_prmt_f4e: + case Intrinsic::nvvm_prmt_rc16: + case Intrinsic::nvvm_prmt_rc8: + return lowerPrmtIntrinsic(Op, DAG); case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: @@ -5775,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto &DAG = DCI.DAG; - auto PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT); + auto PRMT = + getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1), + (Op1Bytes << 8) | Op0Bytes, DL, DAG); + return DAG.getBitcast(VT, PRMT); } static SDValue combineADDRSPACECAST(SDNode *N, @@ -5797,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N, return SDValue(); } +// Given a constant selector value and a prmt mode, return the selector value +// normalized to the generic prmt mode. See the PTX ISA documentation for more +// details: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + if (Mode == NVPTX::PTXPrmtMode::NONE) + return Selector; + + const unsigned V = Selector.trunc(2).getZExtValue(); + + const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2, + unsigned S3) { + return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12)); + }; + + switch (Mode) { + case NVPTX::PTXPrmtMode::F4E: + return GetSelector(V, V + 1, V + 2, V + 3); + case NVPTX::PTXPrmtMode::B4E: + return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7); + case NVPTX::PTXPrmtMode::RC8: + return GetSelector(V, V, V, V); + case NVPTX::PTXPrmtMode::ECL: + return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U); + case NVPTX::PTXPrmtMode::ECR: + return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V); + case NVPTX::PTXPrmtMode::RC16: { + unsigned V1 = (V & 1) << 1; + return GetSelector(V1, V1 + 1, V1, V1 + 1); + } + default: + llvm_unreachable("Invalid PRMT mode"); + } +} + +static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + APInt BitField = B.concat(A); + APInt SelectorVal = getPRMTSelector(Selector, Mode); + APInt Result(32, 0); + for (unsigned I : llvm::seq(4U)) { + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + APInt Byte = BitField.extractBits(8, Idx * 8); + if (Sign) + Byte = Byte.ashr(8); + Result.insertBits(Byte, I * 8); + } + return Result; +} + +static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + // Constant fold PRMT + if (isa<ConstantSDNode>(N->getOperand(0)) && + isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) + return 
DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0), + N->getConstantOperandAPInt(1), + N->getConstantOperandAPInt(2), + N->getConstantOperandVal(3)), + SDLoc(N), N->getValueType(0)); + + return SDValue(); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); switch (N->getOpcode()) { - default: break; - case ISD::ADD: - return PerformADDCombine(N, DCI, OptLevel); - case ISD::FADD: - return PerformFADDCombine(N, DCI, OptLevel); - case ISD::MUL: - return PerformMULCombine(N, DCI, OptLevel); - case ISD::SHL: - return PerformSHLCombine(N, DCI, OptLevel); - case ISD::AND: - return PerformANDCombine(N, DCI); - case ISD::UREM: - case ISD::SREM: - return PerformREMCombine(N, DCI, OptLevel); - case ISD::SETCC: - return PerformSETCCCombine(N, DCI, STI.getSmVersion()); - case ISD::LOAD: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadV2: - case NVPTXISD::LoadV4: - return combineUnpackingMovIntoLoad(N, DCI); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); - case ISD::STORE: - case NVPTXISD::StoreV2: - case NVPTXISD::StoreV4: - return PerformStoreCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACTCombine(N, DCI); - case ISD::VSELECT: - return PerformVSELECTCombine(N, DCI); - case ISD::BUILD_VECTOR: - return PerformBUILD_VECTORCombine(N, DCI); - case ISD::ADDRSPACECAST: - return combineADDRSPACECAST(N, DCI); + default: + break; + case ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); + case ISD::ADDRSPACECAST: + return combineADDRSPACECAST(N, DCI); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::BUILD_VECTOR: + return PerformBUILD_VECTORCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACTCombine(N, DCI); + case ISD::FADD: + return PerformFADDCombine(N, DCI, OptLevel); + case ISD::LOAD: + case NVPTXISD::LoadParamV2: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + return combineUnpackingMovIntoLoad(N, DCI); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case NVPTXISD::PRMT: + return combinePRMT(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI, STI.getSmVersion()); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::SREM: + case ISD::UREM: + return PerformREMCombine(N, DCI, OptLevel); + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + return PerformStoreParamCombine(N, DCI); + case ISD::STORE: + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + return PerformStoreCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } @@ -6387,7 +6511,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2)); unsigned Mode = Op.getConstantOperandVal(3); - if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector) + if (!Selector) return; KnownBits AKnown = DAG.computeKnownBits(A, Depth); @@ -6396,7 +6520,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} KnownBits BitField = BKnown.concat(AKnown); - APInt SelectorVal = Selector->getAPIntValue(); + APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = 
Sel.getLoBits(3).getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4eef6c9..a5bb83d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1453,18 +1453,33 @@ let hasSideEffects = false in { (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32rir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins B32:$a, i32imm:$b, B32:$c), + (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; def PRMT_B32rii : BasicFlagsNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>; - def PRMT_B32rir + def PRMT_B32irr : BasicFlagsNVPTXInst<(outs B32:$d), - (ins B32:$a, i32imm:$b, B32:$c), - (ins PrmtMode:$mode), + (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>; + def PRMT_B32iri + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32iir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode), "prmt.b32$mode", - [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; + [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bad4c3c..70150bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1047,24 +1047,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass, // MISC // -class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c), - (PRMT_B32rrr $a, $b, $c, prmt_mode)>; - -class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$c), - (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>; - -def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>; -def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>; -def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>; - -def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>; -def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>; -def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>; -def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>; - - def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32", [(int_nvvm_nanosleep imm:$i)]>, Requires<[hasPTX<63>, hasSM<70>]>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 0f948b2..cfec46d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3058,17 +3058,28 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, }; if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) { + // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if (LHS.getOpcode() == ISD::ADD && - SelectShl(LHS.getOperand(0), Index, Scale) && !isa<ConstantSDNode>(LHS.getOperand(1)) && isInt<12>(C1->getSExtValue())) { - // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) - SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), - SDLoc(Addr), VT); - Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, - LHS.getOperand(1), C1Val), - 0); - return true; + if (SelectShl(LHS.getOperand(1), Index, Scale)) { + SDValue C1Val = 
CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(0), C1Val), + 0); + return true; + } + + // Add is commutative so we need to check both operands. + if (SelectShl(LHS.getOperand(0), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(1), C1Val), + 0); + return true; + } } // Don't match add with constants. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3af729a..e0a8c07 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -429,7 +429,7 @@ public: bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; @@ -437,15 +437,12 @@ public: bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; - - bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveRes) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef<Value *> InterleaveOps) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index ddfacd9..38cc0ce 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -14,6 +14,7 @@ #include "RISCVISelLowering.h" #include "RISCVSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -68,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. 
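The match that continues below accepts either a constant or a no-unsigned-wrap multiply (m_NUWMul), where the old copy of isMultipleOfN removed further down used the commutative m_c_Mul. The stricter predicate matters for non-power-of-two factors: a product that wraps modulo 2^64 need not remain divisible by C. A standalone demonstration, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // 3 * 0x5555555555555556 overflows uint64_t and wraps to 2, which is
      // no longer divisible by 3, so a plain mul proves nothing about
      // divisibility of the wrapped value (unless the factor is a power of
      // two, which the KnownBits fallback below handles separately).
      uint64_t c = 3, x = 0x5555555555555556ULL;
      uint64_t wrapped = c * x;
      assert(wrapped == 2 && wrapped % c != 0);
    }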
+ uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -81,21 +115,49 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); + IRBuilder<> Builder(Load); + const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *LI = dyn_cast<LoadInst>(Load)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load\n"); + Mask = Builder.getAllOnesMask(VTy->getElementCount()); + VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); + } else { + auto *VPLoad = cast<VPIntrinsic>(Load); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + Ptr = VPLoad->getMemoryPointerParam(); + Alignment = VPLoad->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + assert(Mask && "vp.load needs a mask!"); + + Value *WideEVL = VPLoad->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -104,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.load + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + CI->addParamAttr(0, + Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; - Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -133,18 +192,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): @@ -234,39 +281,15 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. 
- uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) - return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); - } - - return false; -} - bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor > 8) return false; IRBuilder<> Builder(Load); - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - VectorType *ResVTy = cast<VectorType>(FirstActive->getType()); + VectorType *ResVTy = getDeinterleavedVectorType(DI); const DataLayout &DL = Load->getDataLayout(); auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); @@ -298,10 +321,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) return false; - VL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, - ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); } Type *PtrTy = Ptr->getType(); @@ -346,61 +367,74 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. - Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - + DI->replaceAllUsesWith(Return); return true; } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const { + Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const { unsigned Factor = InterleaveValues.size(); if (Factor > 8) return false; - assert(SI->isSimple()); - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Store); auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); + const DataLayout &DL = Store->getDataLayout(); + Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) - return false; + Value *Ptr, *VL; + Align Alignment; + if (auto *SI = dyn_cast<StoreInst>(Store)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(InVTy->getElementCount()); + VL = isa<FixedVectorType>(InVTy) + ? 
Builder.CreateElementCount(XLenTy, InVTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + } else { + auto *VPStore = cast<VPIntrinsic>(Store); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + Ptr = VPStore->getMemoryPointerParam(); + Alignment = VPStore->getPointerAlignment().value_or( + DL.getABITypeAlign(InVTy->getElementType())); + + assert(Mask && "vp.store needs a mask!"); - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + Value *WideEVL = VPStore->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + Type *PtrTy = Ptr->getType(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) + return false; if (isa<FixedVectorType>(InVTy)) { Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, PtrTy, XLenTy}); SmallVector<Value *, 10> Ops(InterleaveValues); - Value *VL = Builder.CreateElementCount(XLenTy, InVTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - + Ops.append({Ptr, Mask, VL}); Builder.CreateCall(VssegNFunc, Ops); return true; } unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), NumElts * SEW / 8), Factor); - Value *VL = Constant::getAllOnesValue(XLenTy); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) StoredVal = Builder.CreateIntrinsic( @@ -408,131 +442,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), ScalableVssegIntrIds[Factor - 2], + Store->getModule(), ScalableVssegIntrIds[Factor - 2], {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); - Value *Operands[] = {StoredVal, SI->getPointerOperand(), Mask, VL, + Value *Operands[] = {StoredVal, Ptr, Mask, VL, ConstantInt::get(XLenTy, Log2_64(SEW))}; Builder.CreateCall(VssegNFunc, Operands); return true; } -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. 
Lower an interleaved vp.load (Factor = 2): -/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.vector.deinterleave2.nxv64i8( -/// <vscale x 64 x i8> %l) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef, -/// <vscale x 32 x i8> undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). -bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
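This removed vp.load path is absorbed into the rewritten lowerInterleavedLoad earlier in the file. One detail shared by all the surviving paths: once the EVL is proven to be a multiple of the factor, the per-segment VL is now computed with an exact udiv (CreateExactUDiv) rather than a plain udiv, which lets later folds remove the division entirely. A minimal sketch of the shared idiom as a free function, assuming LLVM's IRBuilder API:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // WideEVL is the vp intrinsic's vector length; Factor divides it evenly,
    // so an exact udiv is sound. The quotient is widened to XLen for the
    // segment load/store intrinsics.
    static Value *computeSegmentVL(IRBuilder<> &B, Value *WideEVL,
                                   unsigned Factor, Type *XLenTy) {
      auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
      return B.CreateZExt(B.CreateExactUDiv(WideEVL, FactorC), XLenTy);
    }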
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - Value *Return = nullptr; - if (isa<FixedVectorType>(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {VTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonValue::get(VecTupTy), - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector<Type *, 8> AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } - - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. - Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } - - return true; -} - /// Lower an interleaved vp.store into a vssegN intrinsic. /// /// E.g. 
Lower an interleaved vp.store (Factor = 2): @@ -583,9 +501,9 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( auto *PtrTy = Store->getArgOperand(1)->getType(); auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value *EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); if (isa<FixedVectorType>(VTy)) { SmallVector<Value *, 8> Operands(InterleaveOperands); diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 05388f2..3e286a7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,6 +13,17 @@ // //===----------------------------------------------------------------------===// +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); +} + +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -44,6 +55,19 @@ let BufferSize = 0 in { // floating point instructions, this model assumes single issue as // increasing it reduces the gains we saw in performance def SMX60_FP : ProcResource<1>; + + // Vector pipeline + // Single issue for vector store/load instructions + def SMX60_VLS : ProcResource<1>; + + // The C908 user manual says: "Vector floating-point units support vector + // floating-point computation of different bits. In addition, vector integer + // units are added". Developer confirmed it's a separate VIEU + def SMX60_VIEU : ProcResource<1>; + + // The C908 user manual says: "The vector execution unit is developed by + // extending the floating-point unit", so let's assume single issue for now + def SMX60_VFP : ProcResource<1>; } //===----------------------------------------------------------------------===// @@ -232,9 +256,341 @@ let Latency = 4 in { def : WriteRes<WriteFMovI32ToF32, [SMX60_IEU]>; } +// 6. Configuration-Setting Instructions +def : WriteRes<WriteVSETVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETIVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETVL, [SMX60_IEUA]>; + +// 7. 
Vector Loads and Stores +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride loads and stores + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + + // Mask loads and stores + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + + // Strided and indexed loads and stores + foreach eew = [8, 16, 32, 64] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } +} + +// Segmented loads and stores +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride segmented + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } + } +} + +// Whole register move/load/store +foreach LMul = [1, 2, 4, 8] in { + def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>; + + def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; +} + +// 11. 
Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Vector Integer Division and Remainder +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +// Narrowing Shift and Clips +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 12. Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 13. Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c; + + defm "" : 
LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Narrowing +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Vector Floating-Point Division and Square Root +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 14. Vector Reduction Operations +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet<mx, 0, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet<mx, 1, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 15. 
Vector Mask Instructions
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+  defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>;
+
+  defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 16. Vector Permutation Instructions
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+  defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+
+  defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+
+  defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+
+def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+
+// Gather and Compress
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+  defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
 // Others
 def : WriteRes<WriteCSR, [SMX60_IEU]>;
 def : WriteRes<WriteNop, [SMX60_IEU]>;
+def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>;
 
 //===----------------------------------------------------------------------===//
 // Bypass and advance
@@ -341,10 +697,184 @@ def : ReadAdvance<ReadCLMUL, 0>;
 def : ReadAdvance<ReadSingleBit, 0>;
 def : ReadAdvance<ReadSingleBitImm, 0>;
 
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12. Vector Integer Arithmetic Instructions
+defm : LMULReadAdvance<"ReadVIALUV", 0>;
+defm : LMULReadAdvance<"ReadVIALUX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+defm : LMULReadAdvance<"ReadVExtV", 0>;
+defm : LMULReadAdvance<"ReadVICALUV", 0>;
+defm : LMULReadAdvance<"ReadVICALUX", 0>;
+defm : LMULReadAdvance<"ReadVShiftV", 0>;
+defm : LMULReadAdvance<"ReadVShiftX", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+defm : LMULReadAdvance<"ReadVICmpV", 0>;
+defm : LMULReadAdvance<"ReadVICmpX", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+defm : LMULReadAdvance<"ReadVIMulV", 0>;
+defm : LMULReadAdvance<"ReadVIMulX", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+defm : LMULReadAdvance<"ReadVIMovV", 0>;
+defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+// 14. Vector Floating-Point Instructions
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
+foreach mx = SchedMxList in {
+  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
+  foreach sew = SchedSEWSet<mx>.val in
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
+}
+
 //===----------------------------------------------------------------------===//
 // Unsupported extensions
 defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
 defm : UnsupportedSchedZabha;
 defm : UnsupportedSchedZbkb;
 defm : UnsupportedSchedZbkx;
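Editorial note: every vector ReadAdvance entry added above is 0, meaning the X60 model declares no operand forwarding for vector reads. As a quick illustration of what a ReadAdvance value means to the machine scheduler, here is a toy C++ model of the arithmetic; this is illustrative only, not LLVM's implementation:

#include <cassert>

// Toy model of ReadAdvance: a consumer may read a producer's result
// ReadAdvance cycles before the producer's nominal latency has elapsed.
// A value of 0 (as in all the X60 entries above) means no early bypass.
static unsigned operandReadyCycle(unsigned ProducerIssueCycle,
                                  unsigned ProducerLatency,
                                  unsigned ReadAdvance) {
  unsigned Ready = ProducerIssueCycle + ProducerLatency;
  return Ready > ReadAdvance ? Ready - ReadAdvance : 0;
}

int main() {
  // Producer issues at cycle 0 with latency 4.
  assert(operandReadyCycle(0, 4, 0) == 4); // ReadAdvance 0: wait full latency
  assert(operandReadyCycle(0, 4, 2) == 2); // ReadAdvance 2: forwarded early
  return 0;
}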
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index c2b5e01..e656e8b 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
     return TwoTimes ? MILog2SEW + 1 : MILog2SEW;
   }
 
+  // Vector Register Gather with 16-bit Index Elements Instruction
+  // Dest and source data EEW=SEW. Index vector EEW=16.
+  case RISCV::VRGATHEREI16_VV: {
+    if (MO.getOperandNo() == 2)
+      return 4;
+    return MILog2SEW;
+  }
+
   default:
     return std::nullopt;
   }
@@ -1058,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VSLIDEDOWN_VI:
   case RISCV::VSLIDE1UP_VX:
   case RISCV::VFSLIDE1UP_VF:
+  // Vector Register Gather Instructions
+  case RISCV::VRGATHER_VI:
+  case RISCV::VRGATHER_VV:
+  case RISCV::VRGATHER_VX:
+  case RISCV::VRGATHEREI16_VV:
   // Vector Single-Width Floating-Point Add/Subtract Instructions
   case RISCV::VFADD_VF:
   case RISCV::VFADD_VV:
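Editorial note: the new case above encodes that vrgatherei16.vv always reads its index operand at EEW=16 regardless of the instruction's SEW, hence the constant log2 value of 4 (log2(16) == 4). A minimal sketch of the mapping, assuming the diff's operand numbering (operand 2 is the index vector); the helper name is hypothetical, not an LLVM API:

#include <cassert>

// Hypothetical helper mirroring the VRGATHEREI16_VV rule: the index operand
// always has EEW = 16 (log2 EEW = 4); all other operands use EEW = SEW.
static unsigned log2EEWForVRGatherEI16(unsigned OpNo, unsigned MILog2SEW) {
  return OpNo == 2 ? 4 : MILog2SEW;
}

int main() {
  // With SEW = 64 (log2 = 6): data operands report 6, the index reports 4.
  assert(log2EEWForVRGatherEI16(1, 6) == 6);
  assert(log2EEWForVRGatherEI16(2, 6) == 4);
  return 0;
}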
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6897865..ea78dcd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro
 defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>;
 defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>;
 defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>;
-defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>;
 
 // GetQuery builtin records:
 defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>;
@@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>;
 defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
 defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>;
 defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>;
 
 defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fd0bea0..6608b3f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract);
   case Intrinsic::spv_normalize:
     return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize);
+  case Intrinsic::spv_refract:
+    return selectExtInst(ResVReg, ResType, I, GL::Refract);
   case Intrinsic::spv_reflect:
     return selectExtInst(ResVReg, ResType, I, GL::Reflect);
   case Intrinsic::spv_rsqrt:
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f..82e8ce4e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return true;
 }
 
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                 CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i32 && "Should have i32 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+  int64_t Offset = State.AllocateStack(16, Align(16));
+  PendingMembers[0].convertToMem(Offset);
+  PendingMembers[1].convertToMem(Offset + 4);
+  PendingMembers[2].convertToMem(Offset + 8);
+  PendingMembers[3].convertToMem(Offset + 12);
+
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  State.addLoc(PendingMembers[2]);
+  State.addLoc(PendingMembers[3]);
+  PendingMembers.clear();
+  return true;
+}
+
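Editorial note: a small self-contained model of what the handler above computes once the fourth i32 part arrives: the 16-byte block is allocated at the next 16-byte boundary, and the four parts land at offsets +0, +4, +8, +12. StackState and allocate are illustrative stand-ins for CCState::AllocateStack, not LLVM API:

#include <cassert>
#include <cstdint>

// Illustrative stand-in for CCState::AllocateStack: a bump allocator that
// rounds the current offset up to the requested alignment before allocating.
struct StackState {
  int64_t NextOffset = 0;
  int64_t allocate(int64_t Size, int64_t Alignment) {
    NextOffset = (NextOffset + Alignment - 1) / Alignment * Alignment;
    int64_t Offset = NextOffset;
    NextOffset += Size;
    return Offset;
  }
};

int main() {
  StackState State;
  State.allocate(4, 4);                    // a preceding i32 argument at offset 0
  int64_t Offset = State.allocate(16, 16); // the fp128/i128 block
  assert(Offset == 16);                    // padded up to 16, not placed at 4
  // The four i32 slots mirror PendingMembers[0..3] in the handler above.
  int64_t Parts[4] = {Offset, Offset + 4, Offset + 8, Offset + 12};
  assert(Parts[3] == 28);
  return 0;
}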
#include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa..f020e0b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>, + // On swifttailcc pass swiftself in ECX. CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6bcb7a3..2636979 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1661,7 +1661,7 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 9ad3553..b4639ac 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { - // i128 split into i64 needs to be allocated to two consecutive registers, - // or spilled to the stack as a whole. - return Ty->isIntegerTy(128); + // On x86-64 i128 is split into two i64s and needs to be allocated to two + // consecutive registers, or spilled to the stack as a whole. On x86-32 i128 + // is split to four i32s and never actually passed in registers, but we use + // the consecutive register mark to match it in TableGen. + if (Ty->isIntegerTy(128)) + return true; + + // On x86-32, fp128 acts the same as i128. + if (Subtarget.is32Bit() && Ty->isFP128Ty()) + return true; + + return false; } /// Helper for getByValTypeAlignment to determine diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3..360293bc 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, |