Diffstat (limited to 'llvm/lib/Target/RISCV')
-rw-r--r--   llvm/lib/Target/RISCV/RISCVFrameLowering.cpp          | 122
-rw-r--r--   llvm/lib/Target/RISCV/RISCVISelLowering.h             |   5
-rw-r--r--   llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp      | 265
-rw-r--r--   llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp  |  22
4 files changed, 172 insertions, 242 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da..23b4554 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -95,6 +95,11 @@ static const std::pair<MCPhysReg, int8_t> FixedCSRFIQCIInterruptMap[] = {
     /* -21, -22, -23, -24 are reserved */
 };
 
+/// Returns true if DWARF CFI instructions ("frame moves") should be emitted.
+static bool needsDwarfCFI(const MachineFunction &MF) {
+  return MF.needsFrameMoves();
+}
+
 // For now we use x3, a.k.a gp, as pointer to shadow call stack.
 // User should not use x3 in their asm.
 static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
       .addImm(-SlotSize)
       .setMIFlag(MachineInstr::FrameSetup);
 
+  if (!needsDwarfCFI(MF))
+    return;
+
   // Emit a CFI instruction that causes SlotSize to be subtracted from the value
   // of the shadow stack pointer when unwinding past this frame.
   char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true);
@@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
       .addReg(SCSPReg)
       .addImm(-SlotSize)
       .setMIFlag(MachineInstr::FrameDestroy);
-  // Restore the SCS pointer
-  CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+  if (needsDwarfCFI(MF)) {
+    // Restore the SCS pointer
+    CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+  }
 }
 
 // Insert instruction to swap mscratchsw with sp
@@ -935,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
                              getUnmanagedCSI(MF, CSI).size());
   CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+  bool NeedsDwarfCFI = needsDwarfCFI(MF);
 
   // If libcalls are used to spill and restore callee-saved registers, the frame
   // has two sections; the opaque section managed by the libcalls, and the
@@ -962,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
         alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign());
     RVFI->setLibCallStackSize(LibCallFrameSize);
 
-    CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
-    for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
-      CFIBuilder.buildOffset(CS.getReg(),
-                             MFI.getObjectOffset(CS.getFrameIdx()));
+    if (NeedsDwarfCFI) {
+      CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
+      for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+        CFIBuilder.buildOffset(CS.getReg(),
+                               MFI.getObjectOffset(CS.getFrameIdx()));
+    }
   }
 
   // FIXME (note copied from Lanai): This appears to be overallocating. Needs
@@ -996,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
     // could only be the next instruction.
     ++PossiblePush;
 
-    // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
-    // could be. The PUSH will also get its own CFI metadata for its own
-    // modifications, which should come after the PUSH.
-    CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup);
-    PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
-    for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
-      PushCFIBuilder.buildOffset(CS.getReg(),
-                                 MFI.getObjectOffset(CS.getFrameIdx()));
+    if (NeedsDwarfCFI) {
+      // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
+      // could be. The PUSH will also get its own CFI metadata for its own
+      // modifications, which should come after the PUSH.
+      CFIInstBuilder PushCFIBuilder(MBB, PossiblePush,
+                                    MachineInstr::FrameSetup);
+      PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
+      for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
+        PushCFIBuilder.buildOffset(CS.getReg(),
+                                   MFI.getObjectOffset(CS.getFrameIdx()));
+    }
   }
 
   if (RVFI->isPushable(MF) && PossiblePush != MBB.end() &&
@@ -1017,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
     PossiblePush->getOperand(1).setImm(StackAdj);
     StackSize -= StackAdj;
 
-    CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
-    for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
-      CFIBuilder.buildOffset(CS.getReg(),
-                             MFI.getObjectOffset(CS.getFrameIdx()));
+    if (NeedsDwarfCFI) {
+      CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
+      for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+        CFIBuilder.buildOffset(CS.getReg(),
+                               MFI.getObjectOffset(CS.getFrameIdx()));
+    }
   }
 
   // Allocate space on the stack if necessary.
@@ -1031,7 +1049,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   bool DynAllocation =
       MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
   if (StackSize != 0)
-    allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
+    allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI,
                   NeedProbe, ProbeSize, DynAllocation,
                   MachineInstr::FrameSetup);
 
@@ -1049,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
 
   // Iterate over list of callee-saved registers and emit .cfi_offset
   // directives.
-  for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
-    CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx()));
+  if (NeedsDwarfCFI)
+    for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+      CFIBuilder.buildOffset(CS.getReg(),
+                             MFI.getObjectOffset(CS.getFrameIdx()));
 
   // Generate new FP.
   if (hasFP(MF)) {
@@ -1069,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
                 MachineInstr::FrameSetup, getStackAlign());
     }
 
-    CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
+    if (NeedsDwarfCFI)
+      CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
   }
 
   uint64_t SecondSPAdjustAmount = 0;
@@ -1080,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
            "SecondSPAdjustAmount should be greater than zero");
 
     allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
-                  getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
-                  ProbeSize, DynAllocation, MachineInstr::FrameSetup);
+                  getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF),
+                  NeedProbe, ProbeSize, DynAllocation,
+                  MachineInstr::FrameSetup);
   }
 
   if (RVVStackSize) {
     if (NeedProbe) {
       allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
-                                  MachineInstr::FrameSetup, !hasFP(MF),
-                                  DynAllocation);
+                                  MachineInstr::FrameSetup,
+                                  NeedsDwarfCFI && !hasFP(MF), DynAllocation);
     } else {
       // We must keep the stack pointer aligned through any intermediate
       // updates.
@@ -1097,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
                     MachineInstr::FrameSetup, getStackAlign());
     }
 
-    if (!hasFP(MF)) {
+    if (NeedsDwarfCFI && !hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
       CFIBuilder.insertCFIInst(createDefCFAExpression(
           *RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8));
     }
 
     std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
-    emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
+    if (NeedsDwarfCFI)
+      emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
   }
 
   if (hasFP(MF)) {
@@ -1171,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF,
                 MachineInstr::FrameDestroy, getStackAlign());
   StackSize = 0;
 
-  CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
-      .buildDefCFAOffset(CFAOffset);
+  if (needsDwarfCFI(MF))
+    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
+        .buildDefCFAOffset(CFAOffset);
 }
 
 void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -1212,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
       std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
   CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn,
                             MachineInstr::FrameDestroy);
+  bool NeedsDwarfCFI = needsDwarfCFI(MF);
 
   uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
   uint64_t RealStackSize = FirstSPAdjustAmount ? FirstSPAdjustAmount
@@ -1232,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
                   StackOffset::getScalable(RVVStackSize),
                   MachineInstr::FrameDestroy, getStackAlign());
 
-    if (!hasFP(MF))
-      CFIBuilder.buildDefCFA(SPReg, RealStackSize);
-
-    emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+    if (NeedsDwarfCFI) {
+      if (!hasFP(MF))
+        CFIBuilder.buildDefCFA(SPReg, RealStackSize);
+      emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+    }
   }
 
   if (FirstSPAdjustAmount) {
@@ -1251,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
                   StackOffset::getFixed(SecondSPAdjustAmount),
                   MachineInstr::FrameDestroy, getStackAlign());
 
-    if (!hasFP(MF))
+    if (NeedsDwarfCFI && !hasFP(MF))
       CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount);
   }
 
@@ -1272,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
                     getStackAlign());
   }
 
-  if (hasFP(MF))
+  if (NeedsDwarfCFI && hasFP(MF))
     CFIBuilder.buildDefCFA(SPReg, RealStackSize);
 
   // Skip to after the restores of scalar callee-saved registers
@@ -1295,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 
   // Recover callee-saved registers.
-  for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
-    CFIBuilder.buildRestore(CS.getReg());
+  if (NeedsDwarfCFI)
+    for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+      CFIBuilder.buildRestore(CS.getReg());
 
   if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) {
     // Use available stack adjustment in pop instruction to deallocate stack
@@ -1315,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
     auto NextI = next_nodbg(MBBI, MBB.end());
     if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) {
       ++MBBI;
-      CFIBuilder.setInsertPoint(MBBI);
+      if (NeedsDwarfCFI) {
+        CFIBuilder.setInsertPoint(MBBI);
 
-      for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
-        CFIBuilder.buildRestore(CS.getReg());
+        for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+          CFIBuilder.buildRestore(CS.getReg());
 
-      // Update CFA Offset. If this is a QCI interrupt function, there will be a
-      // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise
-      // getQCIInterruptStackSize() will be 0.
-      CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+        // Update CFA Offset. If this is a QCI interrupt function, there will
+        // be a leftover offset which is deallocated by `QC.C.MILEAVERET`,
+        // otherwise getQCIInterruptStackSize() will be 0.
+        CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+      }
     }
   }
 
@@ -1812,7 +1841,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
       // allocateStack.
       bool DynAllocation =
          MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
-      allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF),
+      allocateStack(MBB, MI, MF, -Amount, -Amount,
+                    needsDwarfCFI(MF) && !hasFP(MF),
                     /*NeedProbe=*/true, ProbeSize, DynAllocation,
                     MachineInstr::NoFlags);
     } else {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c..e0a8c07 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ public:
 
   bool fallBackToDAGISel(const Instruction &Inst) const override;
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
@@ -444,9 +444,6 @@ public:
                                 Instruction *Store, Value *Mask,
                                 ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              ArrayRef<Value *> DeinterleaveRes) const override;
-
   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;
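[Editor's note] The RISCVFrameLowering.cpp changes above all follow one pattern: the machine instructions of the prologue/epilogue are still emitted unconditionally, and only the DWARF CFI bookkeeping is now gated on needsDwarfCFI(), i.e. on MF.needsFrameMoves(). A minimal sketch of that shape: needsDwarfCFI, CFIInstBuilder and buildDefCFAOffset are the entities used in the patch, while the surrounding function and emitSPAdjustment() are invented stand-ins for illustration.

// Sketch only; emitSPAdjustment() is hypothetical, the gating mirrors the patch.
static void adjustSPWithCFI(MachineFunction &MF, MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, int64_t Amount) {
  emitSPAdjustment(MBB, MBBI, Amount); // real stack adjustment: always emitted
  if (!needsDwarfCFI(MF))
    return;                            // unwind metadata: only when needed
  CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
      .buildDefCFAOffset(Amount);
}

Call sites that previously hard-coded /*EmitCFI=*/true now thread NeedsDwarfCFI (or needsDwarfCFI(MF) && !hasFP(MF)) through to allocateStack and related helpers instead.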
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 0d4f241..dd68a55 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -102,6 +102,56 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
   return false;
 }
 
+/// Do the common operand retrieval and validation required by the
+/// routines below.
+static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
+                           Instruction *I, Value *&Ptr, Value *&Mask,
+                           Value *&VL, Align &Alignment) {
+
+  IRBuilder<> Builder(I);
+  const DataLayout &DL = I->getDataLayout();
+  ElementCount EC = VTy->getElementCount();
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    assert(LI->isSimple());
+    Ptr = LI->getPointerOperand();
+    Alignment = LI->getAlign();
+    assert(!Mask && "Unexpected mask on a load");
+    Mask = Builder.getAllOnesMask(EC);
+    VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+                                   : Constant::getAllOnesValue(XLenTy);
+    return true;
+  }
+  if (auto *SI = dyn_cast<StoreInst>(I)) {
+    assert(SI->isSimple());
+    Ptr = SI->getPointerOperand();
+    Alignment = SI->getAlign();
+    assert(!Mask && "Unexpected mask on a store");
+    Mask = Builder.getAllOnesMask(EC);
+    VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+                                   : Constant::getAllOnesValue(XLenTy);
+    return true;
+  }
+  auto *VPLdSt = cast<VPIntrinsic>(I);
+  assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
+          VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
+         "Unexpected intrinsic");
+  Ptr = VPLdSt->getMemoryPointerParam();
+  Alignment = VPLdSt->getPointerAlignment().value_or(
+      DL.getABITypeAlign(VTy->getElementType()));
+
+  assert(Mask && "vp.load and vp.store needs a mask!");
+
+  Value *WideEVL = VPLdSt->getVectorLengthParam();
+  // Conservatively check if EVL is a multiple of factor, otherwise some
+  // (trailing) elements might be lost after the transformation.
+  if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+    return false;
+
+  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+  VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+  return true;
+}
+
 /// Lower an interleaved load into a vlsegN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -115,21 +165,25 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool RISCVTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Indices.size() == Shuffles.size());
 
-  IRBuilder<> Builder(LI);
-
-  const DataLayout &DL = LI->getDataLayout();
+  IRBuilder<> Builder(Load);
+  const DataLayout &DL = Load->getDataLayout();
 
   auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
-  if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
-                                    LI->getPointerAddressSpace(), DL))
+  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
     return false;
 
-  auto *PtrTy = LI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   // If the segment load is going to be performed segment at a time anyways
   // and there's only one element used, use a strided load instead.  This
@@ -138,26 +192,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
     unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.load
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
     CallInst *CI =
         Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
                                 {VTy, BasePtr->getType(), Stride->getType()},
                                 {BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(
-        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    CI->addParamAttr(0,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
     Shuffles[0]->replaceAllUsesWith(CI);
     return true;
   };
 
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
   CallInst *VlsegN = Builder.CreateIntrinsic(
-      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
-      {LI->getPointerOperand(), Mask, VL});
+      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
 
   for (unsigned i = 0; i < Shuffles.size(); i++) {
     Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
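[Editor's note] The single-shuffle fast path in the hunk above is worth a worked instance, since the stride/offset arithmetic is easy to misread. The concrete numbers below are invented for illustration; only the two formulas come from the code:

// Factor = 3 interleave of i32, with only field 1 (Indices[0] == 1) used:
//   Stride = Factor * sizeof(i32)     = 3 * 4 = 12 bytes (one whole group)
//   Offset = Indices[0] * sizeof(i32) = 1 * 4 =  4 bytes (skip to field 1)
// so the lowering emits llvm.experimental.vp.strided.load from ptr+4 with
// stride 12 instead of a full three-field vlseg3 load.
#include <cstdint>
int main() {
  const uint64_t ScalarSizeInBytes = 4, Factor = 3, Index = 1;
  uint64_t Stride = Factor * ScalarSizeInBytes; // 12
  uint64_t Offset = Index * ScalarSizeInBytes;  // 4
  return (Stride == 12 && Offset == 4) ? 0 : 1;
}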
@@ -271,34 +322,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
 
   Value *Ptr, *VL;
   Align Alignment;
-  if (auto *LI = dyn_cast<LoadInst>(Load)) {
-    assert(LI->isSimple());
-    Ptr = LI->getPointerOperand();
-    Alignment = LI->getAlign();
-    assert(!Mask && "Unexpected mask on a load\n");
-    Mask = Builder.getAllOnesMask(ResVTy->getElementCount());
-    VL = isa<FixedVectorType>(ResVTy)
-             ? Builder.CreateElementCount(XLenTy, ResVTy->getElementCount())
-             : Constant::getAllOnesValue(XLenTy);
-  } else {
-    auto *VPLoad = cast<VPIntrinsic>(Load);
-    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
-           "Unexpected intrinsic");
-    Ptr = VPLoad->getMemoryPointerParam();
-    Alignment = VPLoad->getPointerAlignment().value_or(
-        DL.getABITypeAlign(ResVTy->getElementType()));
-
-    assert(Mask && "vp.load needs a mask!");
-
-    Value *WideEVL = VPLoad->getVectorLengthParam();
-    // Conservatively check if EVL is a multiple of factor, otherwise some
-    // (trailing) elements might be lost after the transformation.
-    if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
-      return false;
-
-    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-  }
+  if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
+    return false;
 
   Type *PtrTy = Ptr->getType();
   unsigned AS = PtrTy->getPointerAddressSpace();
@@ -360,34 +385,8 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
 
   Value *Ptr, *VL;
   Align Alignment;
-  if (auto *SI = dyn_cast<StoreInst>(Store)) {
-    assert(SI->isSimple());
-    Ptr = SI->getPointerOperand();
-    Alignment = SI->getAlign();
-    assert(!Mask && "Unexpected mask on a store");
-    Mask = Builder.getAllOnesMask(InVTy->getElementCount());
-    VL = isa<FixedVectorType>(InVTy)
-             ? Builder.CreateElementCount(XLenTy, InVTy->getElementCount())
-             : Constant::getAllOnesValue(XLenTy);
-  } else {
-    auto *VPStore = cast<VPIntrinsic>(Store);
-    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store &&
-           "Unexpected intrinsic");
-    Ptr = VPStore->getMemoryPointerParam();
-    Alignment = VPStore->getPointerAlignment().value_or(
-        DL.getABITypeAlign(InVTy->getElementType()));
-
-    assert(Mask && "vp.store needs a mask!");
-
-    Value *WideEVL = VPStore->getVectorLengthParam();
-    // Conservatively check if EVL is a multiple of factor, otherwise some
-    // (trailing) elements might be lost after the transformation.
-    if (!isMultipleOfN(WideEVL, DL, Factor))
-      return false;
-
-    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-  }
+  if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment))
+    return false;
 
   Type *PtrTy = Ptr->getType();
   unsigned AS = Ptr->getType()->getPointerAddressSpace();
   if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL))
     return false;
@@ -426,122 +425,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   return true;
 }
 
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-///   %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-///                                                         %mask,
-///                                                         i32 %wide.rvl)
-///   %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.vector.deinterleave2.nxv64i8(
-///               <vscale x 64 x i8> %l)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-///   %rvl = udiv %wide.rvl, 2
-///   %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-///                                                 <vscale x 32 x i8> undef,
-///                                                 ptr %ptr,
-///                                                 %mask,
-///                                                 i64 %rvl,
-///                                                 i64 1)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
-  const unsigned Factor = DeinterleaveResults.size();
-  assert(Mask && "Expect a valid mask");
-  assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
-         "Unexpected intrinsic");
-
-  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
-  auto &DL = Load->getModule()->getDataLayout();
-  Align Alignment = Load->getParamAlign(0).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Load);
-
-  Value *WideEVL = Load->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Load->getArgOperand(0)->getType();
-  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  Value *Return = nullptr;
-  if (isa<FixedVectorType>(VTy)) {
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {VTy, PtrTy, XLenTy},
-                                     {Load->getArgOperand(0), Mask, EVL});
-  } else {
-    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        Load->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
-        {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-    Value *Operands[] = {
-        PoisonValue::get(VecTupTy),
-        Load->getArgOperand(0),
-        Mask,
-        EVL,
-        ConstantInt::get(XLenTy,
-                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
-        ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
-    SmallVector<Type *, 8> AggrTypes{Factor, VTy};
-    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
-    for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract =
-          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
-      Return = Builder.CreateInsertValue(Return, VecExtract, i);
-    }
-  }
-
-  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
-    if (!DIO)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIO->replaceAllUsesWith(NewEV);
-  }
-
-  return true;
-}
-
 /// Lower an interleaved vp.store into a vssegN intrinsic.
 ///
 /// E.g. Lower an interleaved vp.store (Factor = 2):
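[Editor's note] Net effect of the RISCVInterleavedAccess.cpp change: lowerInterleavedLoad, lowerDeinterleaveIntrinsicToLoad and lowerInterleaveIntrinsicToStore now obtain pointer, mask, VL and alignment from the shared getMemOperands() helper, and vp.load/vp.store support comes from that helper rather than from the dedicated lowerInterleavedVPLoad removed above. The EVL handling it centralizes, restated numerically below with illustrative values (the real code builds the same computation with IRBuilder::CreateExactUDiv):

#include <cassert>
#include <cstdint>

// Mirrors the isMultipleOfN guard plus exact division in getMemOperands().
static uint64_t perSegmentVL(uint64_t WideEVL, uint64_t Factor) {
  assert(WideEVL % Factor == 0 && "would silently drop trailing elements");
  return WideEVL / Factor;
}

int main() {
  // Factor = 4, wide EVL = 12: each of the 4 segment fields gets VL = 3.
  uint64_t VL = perSegmentVL(12, 4);
  // A wide EVL of 10 is *not* divided; getMemOperands() instead returns
  // false and the whole transformation is skipped.
  return VL == 3 ? 0 : 1;
}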
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
index c1f4d19..3bd2705 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -10,6 +10,10 @@
 // instructions and masked instructions, so that we can reduce the live range
 // overlaps of mask registers.
 //
+// If there are multiple mask producers followed by multiple masked
+// instructions, then at each masked instruction add dependency edges between
+// every producer and that masked instruction.
+//
 // The reason why we need to do this:
 // 1. When tracking register pressure, we don't track physical registers.
 // 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't
@@ -67,11 +71,27 @@ public:
 
   void apply(ScheduleDAGInstrs *DAG) override {
     SUnit *NearestUseV0SU = nullptr;
+    SmallVector<SUnit *, 2> DefMask;
     for (SUnit &SU : DAG->SUnits) {
       const MachineInstr *MI = SU.getInstr();
-      if (MI->findRegisterUseOperand(RISCV::V0, TRI))
+      bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI);
+      if (isSoleUseCopyToV0(SU) && !UseV0)
+        DefMask.push_back(&SU);
+
+      if (UseV0) {
         NearestUseV0SU = &SU;
 
+        // Copy may not be a real use, so skip it here.
+        if (DefMask.size() > 1 && !MI->isCopy()) {
+          for (SUnit *Def : DefMask)
+            if (DAG->canAddEdge(Def, &SU))
+              DAG->addEdge(Def, SDep(&SU, SDep::Artificial));
+        }
+
+        if (!DefMask.empty())
+          DefMask.erase(DefMask.begin());
+      }
+
       if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) &&
           // For LMUL=8 cases, there will be more possibilities to spill.
          // FIXME: We should use RegPressureTracker to do fine-grained
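[Editor's note] The scheduling mutation is easiest to read as an ordering constraint: while one masked instruction is consuming V0, every other pending mask producer is forced to schedule after it, so only one mask value needs to be live at a time. Schematically (P1/P2 are mask producers and M1/M2 the masked instructions using them; the two API calls are exactly the ones in the patch, the scenario is invented):

// Without the artificial edges, a schedule such as
//   P1, P2, copy-to-V0(P1), M1, copy-to-V0(P2), M2
// keeps both mask values alive across M1. Adding, at M1 (= SU), an edge
// for each still-pending producer Def (here P2):
if (DAG->canAddEdge(Def, &SU))
  DAG->addEdge(Def, SDep(&SU, SDep::Artificial)); // SU must precede Def
// forces   P1, copy-to-V0(P1), M1, P2, copy-to-V0(P2), M2
// so the mask live ranges no longer overlap.

Note the edge direction: SDep(&SU, SDep::Artificial) installs the masked instruction SU as a predecessor of the producer Def, which is what delays Def rather than the other way around.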