Diffstat (limited to 'llvm/lib/Target')
27 files changed, 491 insertions, 319 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 201bfe0..d6a3d59 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1236,14 +1236,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, .add(MI.getOperand(3)); transferImpOps(MI, I, I); } else { + unsigned RegState = + getRenamableRegState(MI.getOperand(1).isRenamable()) | + getKillRegState( + MI.getOperand(1).isKill() && + MI.getOperand(1).getReg() != MI.getOperand(2).getReg() && + MI.getOperand(1).getReg() != MI.getOperand(3).getReg()); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 : AArch64::ORRv16i8)) .addReg(DstReg, RegState::Define | getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)); + .addReg(MI.getOperand(1).getReg(), RegState) + .addReg(MI.getOperand(1).getReg(), RegState); auto I2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d068a12..b033f88 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7362,7 +7362,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc, V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", []>; + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>; let Predicates = [HasAES] in { def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc, V128, V64, V64, @@ -7374,10 +7376,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)), (extract_high_v2i64 (v2i64 V128:$Rm))))]>; } - - def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), - (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), - (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, @@ -7402,6 +7400,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, @@ -7483,6 +7482,7 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; } +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ac31236..8cfbff9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6055,6 +6055,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +let isCommutable = 1 in defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", 
int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; @@ -6806,6 +6807,7 @@ defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn> defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +let isCommutable = 1 in defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; @@ -6822,6 +6824,7 @@ defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>; defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>; +let isCommutable = 0 in defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", @@ -6836,6 +6839,7 @@ defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; +let isCommutable = 0 in defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index f136a184..a67bd42 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ClMaxLifetimes); if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - uint64_t Size = - cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + uint64_t Size = *Info.AI->getAllocationSize(*DL); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), TagPCall, Size); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f05add..5c94aeb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) { VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; } -static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { +static InstructionCost getHistogramCost(const AArch64Subtarget *ST, + const IntrinsicCostAttributes &ICA) { + // We need to know at least the number of elements in the vector of buckets + // and the size of each element to update. + if (ICA.getArgTypes().size() < 2) + return InstructionCost::getInvalid(); + + // Only interested in costing for the hardware instruction from SVE2. 
+ if (!ST->hasSVE2()) + return InstructionCost::getInvalid(); + Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements unsigned TotalHistCnts = 1; @@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; TotalHistCnts = EC / NaturalVectorWidth; + + return InstructionCost(BaseHistCntCost * TotalHistCnts); } - return InstructionCost(BaseHistCntCost * TotalHistCnts); + return InstructionCost::getInvalid(); } InstructionCost @@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { - case Intrinsic::experimental_vector_histogram_add: - if (!ST->hasSVE2()) - return InstructionCost::getInvalid(); - return getHistogramCost(ICA); + case Intrinsic::experimental_vector_histogram_add: { + InstructionCost HistCost = getHistogramCost(ST, ICA); + // If the cost isn't valid, we may still be able to scalarize + if (HistCost.isValid()) + return HistCost; + break; + } case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: @@ -3975,6 +3990,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead( return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; } +std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost( + Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const { + if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy()) + return std::nullopt; + if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16()) + return std::nullopt; + + Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext())); + InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; + Cost += InstCost(PromotedTy); + if (IncludeTrunc) + Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy, + TTI::CastContextHint::None, CostKind); + return Cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, @@ -3997,6 +4033,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + // Increase the cost for half and bfloat types if not architecturally + // supported. + if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL || + ISD == ISD::FDIV || ISD == ISD::FREM) + if (auto PromotedCost = getFP16BF16PromoteCost( + Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true, + [&](Type *PromotedTy) { + return getArithmeticInstrCost(Opcode, PromotedTy, CostKind, + Op1Info, Op2Info); + })) + return *PromotedCost; + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4265,11 +4313,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( [[fallthrough]]; case ISD::FADD: case ISD::FSUB: - // Increase the cost for half and bfloat types if not architecturally - // supported. 
- if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || - (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) - return 2 * LT.first; if (!Ty->getScalarType()->isFP128Ty()) return LT.first; [[fallthrough]]; @@ -4371,25 +4414,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } if (Opcode == Instruction::FCmp) { - // Without dedicated instructions we promote f16 + bf16 compares to f32. - if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || - ValTy->getScalarType()->isBFloatTy()) { - Type *PromotedTy = - ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); - InstructionCost Cost = - getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - if (!Op1Info.isConstant() && !Op2Info.isConstant()) - Cost *= 2; - Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, - Op1Info, Op2Info); - if (ValTy->isVectorTy()) - Cost += getCastInstrCost( - Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), - VectorType::getInteger(cast<VectorType>(PromotedTy)), - TTI::CastContextHint::None, CostKind); - return Cost; - } + if (auto PromotedCost = getFP16BF16PromoteCost( + ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false, + [&](Type *PromotedTy) { + InstructionCost Cost = + getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, + CostKind, Op1Info, Op2Info); + if (isa<VectorType>(PromotedTy)) + Cost += getCastInstrCost( + Instruction::Trunc, + VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); + return Cost; + })) + return *PromotedCost; auto LT = getTypeLegalizationCost(ValTy); // Model unknown fp compares as a libcall. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7f45177..fa9b25a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -435,6 +435,14 @@ public: bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); } + /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the + /// architecture features are not present. + std::optional<InstructionCost> + getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const; + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..626734a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. 
- // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. - const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). - if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. 
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..64a9bde 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index f580f43..c21a9a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -109,12 +109,17 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // Find AV_* registers assigned to AGPRs. 
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.isVectorSuperClass(VirtRegRC)) + if (!TRI.hasAGPRs(VirtRegRC)) continue; - const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + const TargetRegisterClass *AssignedRC = VirtRegRC; + if (TRI.hasVGPRs(VirtRegRC)) { + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + } LiveInterval &LI = LIS.getInterval(VReg); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? 
std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. 
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. 
occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. 
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. 
Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. 
If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b327fb..1b7d65a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..49425d5 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. 
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ea99cc4..75d3cfa 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::BSWAP, VT, Expand); } + if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::SCMP, MVT::i32, Custom); + + if (!Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); @@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } +bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { + return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. 
Consequently, @@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op, return DAG.getBitcast(MVT::i32, Res); } +SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + // Determine if this is signed or unsigned comparison + bool IsSigned = (Op.getOpcode() == ISD::SCMP); + + // Special case for Thumb1 UCMP only + if (!IsSigned && Subtarget->isThumb1Only()) { + // For Thumb unsigned comparison, use this sequence: + // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags + // sbc r2, r2 ; r2 = r2 - r2 - !carry + // cmp r1, r0 ; compare RHS with LHS + // sbc r1, r1 ; r1 = r1 - r1 - !carry + // subs r0, r2, r1 ; r0 = r2 - r1 (final result) + + // First subtraction: LHS - RHS + SDValue Sub1WithFlags = DAG.getNode( + ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + SDValue Sub1Result = Sub1WithFlags.getValue(0); + SDValue Flags1 = Sub1WithFlags.getValue(1); + + // SUBE: Sub1Result - Sub1Result - !carry + // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned) + SDValue Sbc1 = + DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), + Sub1Result, Sub1Result, Flags1); + SDValue Sbc1Result = Sbc1.getValue(0); + + // Second comparison: RHS vs LHS (reverse comparison) + SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS); + + // SUBE: RHS - RHS - !carry + // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned) + SDValue Sbc2 = DAG.getNode( + ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags); + SDValue Sbc2Result = Sbc2.getValue(0); + + // Final subtraction: Sbc1Result - Sbc2Result (no flags needed) + SDValue Result = + DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result); + if (Op.getValueType() != MVT::i32) + Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType()); + + return Result; + } + + // For the ARM assembly pattern: + // subs r0, r0, r1 ; subtract RHS from LHS and set flags + // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for + // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for + // signed, LO for unsigned) + // ; if LHS == RHS, result remains 0 from the subs + + // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC + unsigned Opcode = ARMISD::SUBC; + + // Check if RHS is a subtraction against 0: (0 - X) + if (RHS.getOpcode() == ISD::SUB) { + SDValue SubLHS = RHS.getOperand(0); + SDValue SubRHS = RHS.getOperand(1); + + // Check if it's 0 - X + if (isNullConstant(SubLHS)) { + bool CanUseAdd = false; + if (IsSigned) { + // For SCMP: only if X is known to never be INT_MIN (to avoid overflow) + if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS) + .getSignedMinValue() + .isMinSignedValue()) { + CanUseAdd = true; + } + } else { + // For UCMP: only if X is known to never be zero + if (DAG.isKnownNeverZero(SubRHS)) { + CanUseAdd = true; + } + } + + if (CanUseAdd) { + Opcode = ARMISD::ADDC; + RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of + // LHS - (0 - X) + } + } + } + + // Generate the operation with flags + SDValue OpWithFlags; + if (Opcode == ARMISD::ADDC) { + // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) + OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } else { + // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) + OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } + + 
SDValue OpResult = OpWithFlags.getValue(0); // The operation result + SDValue Flags = OpWithFlags.getValue(1); // The flags + + // Constants for conditional moves + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32); + + // Select condition codes based on signed vs unsigned + ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI; + ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO; + + // First conditional move: if greater than, set to 1 + SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32); + SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One, + GTCondValue, Flags); + + // Second conditional move: if less than, set to -1 + SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32); + SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, + LTCondValue, Flags); + + if (Op.getValueType() != MVT::i32) + Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType()); + + return Result2; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::UCMP: + case ISD::SCMP: + return LowerCMP(Op, DAG); } } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d..a84a3cb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -607,6 +607,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { @@ -904,6 +906,7 @@ class VectorType; void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index fda9d97..ca5d27d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -254,7 +254,8 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); + if (!F.getParent()->isLinkerRelaxable()) + F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 76dca47..f123040 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SpillsKnownBit = true; break; default: + // When spilling a CR bit, the super register may not be explicitly defined + // (i.e. it can be defined by a CR-logical that only defines the subreg) so + // we state that the CR field is undef. 
Also, in order to preserve the kill + // flag on the CR bit, we add it as an implicit use. + // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all // bits (specifically, it produces a -1 if the CR bit is set). Ultimately, // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit // register), and SETNBC will set this. if (Subtarget.isISA3_1()) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg) - .addReg(SrcReg, RegState::Undef); + .addReg(SrcReg, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } @@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT || SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg) - .addReg(getCRFromCRBit(SrcReg), RegState::Undef); + .addReg(getCRFromCRBit(SrcReg), RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } } // We need to move the CR field that contains the CR bit we are spilling. - // The super register may not be explicitly defined (i.e. it can be defined - // by a CR-logical that only defines the subreg) so we state that the CR - // field is undef. Also, in order to preserve the kill flag on the CR bit, - // we add it as an implicit use. BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) .addReg(getCRFromCRBit(SrcReg), RegState::Undef) .addReg(SrcReg, diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 67cc01e..e0ac591 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -674,6 +674,9 @@ static constexpr FeatureBitset XAndesGroup = { static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions + {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, + {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, + {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"}, {DecoderTableXVentana32, {RISCV::FeatureVendorXVentanaCondOps}, "XVentanaCondOps"}, @@ -690,9 +693,6 @@ static constexpr DecoderListEntry DecoderList32[]{ "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, // Standard Extensions - {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, - {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"}, - {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, {DecoderTableZfinx32, {}, "Zfinx (Float in Integer)"}, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index a997ea5..8d956ce 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,6 +32,11 @@ static cl::opt<bool> ULEB128Reloc( "riscv-uleb128-reloc", cl::init(true), cl::Hidden, cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate")); +static cl::opt<bool> + AlignRvc("riscv-align-rvc", cl::init(true), cl::Hidden, + cl::desc("When generating R_RISCV_ALIGN, insert $alignment-2 " + "bytes of NOPs even in norvc code")); + RISCVAsmBackend::RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, const MCTargetOptions &Options) : 
MCAsmBackend(llvm::endianness::little), STI(STI), OSABI(OSABI), @@ -306,12 +311,21 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, // If conditions are met, compute the padding size and create a fixup encoding // the padding size in the addend. bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { - // Use default handling unless linker relaxation is enabled and the alignment - // is larger than the nop size. - const MCSubtargetInfo *STI = F.getSubtargetInfo(); - if (!STI->hasFeature(RISCV::FeatureRelax)) + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) return false; - unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; + + // Use default handling unless the alignment is larger than the nop size. + const MCSubtargetInfo *STI = F.getSubtargetInfo(); + unsigned MinNopLen = + AlignRvc || STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; if (F.getAlignment() <= MinNopLen) return false; @@ -321,7 +335,6 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); return true; } @@ -474,8 +487,9 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, // TODO: emit a mapping symbol right here if (Count % 4 == 2) { - // The canonical nop with Zca is c.nop. - OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2); + // The canonical nop with Zca is c.nop. For .balign 4, we generate a 2-byte + // c.nop even in a norvc region. 
+ OS.write("\x01\0", 2); Count -= 2; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 5541506..24ebbc3 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -524,16 +524,33 @@ foreach mx = SchedMxListW in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // Latency of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c; + // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites + let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in { + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + } + + defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in { + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 13. Vector Floating-Point Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 05d504c..6a1f4b3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -114,6 +114,9 @@ public: bool enableScalableVectorization() const override { return ST->hasVInstructions(); } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override { + return ST->hasVInstructions(); + } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { return ST->hasVInstructions() ? 
TailFoldingStyle::DataWithEVL diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 74aec4f..2b34f61 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) { } } -static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, - ArrayRef<unsigned> OpNos) { - Function *F = nullptr; - if (OpNos.empty()) { - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); - } else { - SmallVector<Type *, 4> Tys; - for (unsigned OpNo : OpNos) - Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); - } - II->setCalledFunction(F); +static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) { + IRBuilder<> Builder(II); + auto *Alloca = cast<AllocaInst>(II->getArgOperand(0)); + std::optional<TypeSize> Size = + Alloca->getAllocationSize(Alloca->getDataLayout()); + Value *SizeVal = Builder.getInt64(Size ? *Size : -1); + Builder.CreateIntrinsic(NewID, Alloca->getType(), + {SizeVal, II->getArgOperand(0)}); + II->eraseFromParent(); return true; } @@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_start: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_start); } else { II->eraseFromParent(); Changed = true; @@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_end: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_end); } else { II->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f32c9bd..2611c29 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess( C2.ScaleCost, C2.SetupCost); } -bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Support only equal feature bitsets. Restriction should be relaxed in the - // future to allow inlining when callee's bits are subset of the caller's. 
- return CallerBits == CalleeBits; -} - unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (!Vector) diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index dc5736e..fc681de 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -65,9 +65,6 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const override; - /// @} /// \name Vector TTI Implementations |
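The AArch64 cost-model hunks above fold the old ad-hoc f16/bf16 handling into a shared getFP16BF16PromoteCost helper: when half or bfloat arithmetic is not natively available, an operation is priced as the extends to f32, the same operation on f32, and (for arithmetic, not compares) a truncate back. The sketch below only mirrors the shape of that model with made-up unit costs; it is not the AArch64 cost table.

#include <iostream>

// Toy version of the promotion cost model introduced by
// getFP16BF16PromoteCost: fpext the operands, do the operation in f32,
// optionally fptrunc the result back. Unit costs are placeholders.
struct PromoteCostModel {
  unsigned FPExtCost = 1;
  unsigned FPTruncCost = 1;

  unsigned cost(unsigned F32OpCost, bool OneOperandIsConstant,
                bool IncludeTrunc) const {
    // A constant operand can be materialized directly in f32, so only one
    // extend is charged; otherwise both operands are extended.
    unsigned C = OneOperandIsConstant ? FPExtCost : 2 * FPExtCost;
    C += F32OpCost;
    if (IncludeTrunc) // arithmetic truncates back to f16/bf16; compares don't
      C += FPTruncCost;
    return C;
  }
};

int main() {
  PromoteCostModel M;
  // A half-precision fadd without native fp16 support:
  // 2 extends + 1 f32 fadd + 1 truncate = 4.
  std::cout << M.cost(/*F32OpCost=*/1, /*OneOperandIsConstant=*/false,
                      /*IncludeTrunc=*/true)
            << '\n';
  return 0;
}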
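The AMDGPU changes in AMDGPUCallLowering.cpp and SIISelLowering.cpp record the number of wave-dispatch SGPRs/VGPRs when the calling convention is lowered, via CCState::getFirstUnallocated over the register class, rather than re-deriving the counts from IR argument types in AMDGPUAsmPrinter. Below is a minimal standalone model of the counting assumption the patch comments call out (registers are allocated in ascending order with no gaps); the names are illustrative, not the LLVM API.

#include <cassert>
#include <cstddef>
#include <vector>

// If argument registers are handed out contiguously from index 0, the number
// of registers consumed by wave dispatch equals the index of the first
// unallocated register in the class.
static unsigned firstUnallocated(const std::vector<bool> &Allocated) {
  unsigned N = 0;
  while (N < Allocated.size() && Allocated[N])
    ++N;
  // The "no gaps" assumption: nothing past N may be allocated.
  for (std::size_t I = N; I < Allocated.size(); ++I)
    assert(!Allocated[I] && "non-contiguous allocation breaks the count");
  return N;
}

int main() {
  // Say VGPR0..VGPR3 carry formal arguments and VGPR4 onwards are untouched.
  std::vector<bool> VGPRs = {true, true, true, true, false, false};
  assert(firstUnallocated(VGPRs) == 4); // -> NumWaveDispatchVGPRs
  return 0;
}

For kernels with preloaded kernarg SGPRs, the patch records getNumUserSGPRs() instead, since those registers are accounted for as user SGPRs rather than being allocated by the calling-convention analysis.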
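The GCNRegPressure changes drop the CombineVGPRSavings mode and reduce GCNRPTarget to two checks: per-bank usage against the SGPR/VGPR limits, plus a combined VGPR cap when the subtarget has a unified register file. A toy model of the satisfied() predicate, with illustrative field names and limits rather than real subtarget numbers:

#include <cassert>

// Toy model of GCNRPTarget::satisfied() after the rework: the target is met
// only if SGPR and per-bank VGPR usage stay within their limits and, for
// unified register files, the combined ArchVGPR + AGPR usage fits as well.
struct RPTargetModel {
  unsigned MaxSGPRs;
  unsigned MaxVGPRs;        // per-bank limit (ArchVGPRs or AGPRs)
  unsigned MaxUnifiedVGPRs; // only meaningful when UnifiedRF is true
  bool UnifiedRF;

  bool satisfied(unsigned SGPRs, unsigned ArchVGPRs, unsigned AGPRs) const {
    if (SGPRs > MaxSGPRs || ArchVGPRs > MaxVGPRs || AGPRs > MaxVGPRs)
      return false;
    return !UnifiedRF || ArchVGPRs + AGPRs <= MaxUnifiedVGPRs;
  }
};

int main() {
  RPTargetModel T{/*MaxSGPRs=*/102, /*MaxVGPRs=*/256,
                  /*MaxUnifiedVGPRs=*/512, /*UnifiedRF=*/true};
  assert(T.satisfied(80, 200, 100));  // all limits respected
  assert(!T.satisfied(80, 200, 300)); // AGPR bank over its limit
  return 0;
}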
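The new ARMTargetLowering::LowerCMP expands ISD::SCMP/ISD::UCMP into a flag-setting subtract (or an add, when the RHS is provably a negation) followed by two conditional moves: +1 on GT/HI, -1 on LT/LO. The Thumb1-only unsigned path reaches the same value through a subs/sbc sequence instead. A small standalone sketch of the reference semantics of the node being lowered:

#include <cassert>
#include <cstdint>

// ISD::SCMP / ISD::UCMP are three-way compares returning -1, 0 or +1. The
// CMOV-based expansion ("subs; movgt/movhi #1; mvnlt/mvnlo #0") computes
// exactly (LHS > RHS) - (LHS < RHS) under signed or unsigned conditions.
static int32_t scmp(int32_t LHS, int32_t RHS) {
  return (LHS > RHS) - (LHS < RHS);
}
static int32_t ucmp(uint32_t LHS, uint32_t RHS) {
  return (LHS > RHS) - (LHS < RHS);
}

int main() {
  assert(scmp(-5, 3) == -1 && scmp(3, 3) == 0 && scmp(7, 3) == 1);
  // Unsigned: all-ones compares greater than 1, so the HI/LO conditions matter.
  assert(ucmp(0xFFFFFFFFu, 1u) == 1 && ucmp(0u, 1u) == -1);
  return 0;
}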