diff options
Diffstat (limited to 'llvm/lib/Target')
22 files changed, 188 insertions, 98 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 8d6eb91..4357264d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -282,7 +282,7 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", static cl::opt<bool> SplitSVEObjects("aarch64-split-sve-objects", cl::desc("Split allocation of ZPR & PPR objects"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); cl::opt<bool> EnableHomogeneousPrologEpilog( "homogeneous-prolog-epilog", cl::Hidden, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 36c9cb6..bc6b931 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1010,6 +1010,36 @@ let Predicates = [HasSVE_or_SME] in { defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>; + + // mul x (splat -1) -> neg x + def : Pat<(nxv16i8 (AArch64mul_m1 nxv16i1:$Op1, nxv16i8:$Op2, (nxv16i8 (splat_vector (i32 -1))))), + (NEG_ZPmZ_B $Op2, $Op1, $Op2)>; + def : Pat<(nxv8i16 (AArch64mul_m1 nxv8i1:$Op1, nxv8i16:$Op2, (nxv8i16 (splat_vector (i32 -1))))), + (NEG_ZPmZ_H $Op2, $Op1, $Op2)>; + def : Pat<(nxv4i32 (AArch64mul_m1 nxv4i1:$Op1, nxv4i32:$Op2, (nxv4i32 (splat_vector (i32 -1))))), + (NEG_ZPmZ_S $Op2, $Op1, $Op2)>; + def : Pat<(nxv2i64 (AArch64mul_m1 nxv2i1:$Op1, nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 -1))))), + (NEG_ZPmZ_D $Op2, $Op1, $Op2)>; + + let AddedComplexity = 5 in { + def : Pat<(nxv16i8 (AArch64mul_p nxv16i1:$Op1, nxv16i8:$Op2, (nxv16i8 (splat_vector (i32 -1))))), + (NEG_ZPmZ_B_UNDEF $Op2, $Op1, $Op2)>; + def : Pat<(nxv8i16 (AArch64mul_p nxv8i1:$Op1, nxv8i16:$Op2, (nxv8i16 (splat_vector (i32 -1))))), + (NEG_ZPmZ_H_UNDEF $Op2, $Op1, $Op2)>; + def : Pat<(nxv4i32 (AArch64mul_p nxv4i1:$Op1, nxv4i32:$Op2, (nxv4i32 (splat_vector (i32 -1))))), + (NEG_ZPmZ_S_UNDEF $Op2, $Op1, $Op2)>; + def : Pat<(nxv2i64 (AArch64mul_p nxv2i1:$Op1, nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 -1))))), + (NEG_ZPmZ_D_UNDEF $Op2, $Op1, $Op2)>; + } + + def : Pat<(nxv16i8 (AArch64mul_m1 nxv16i1:$Op1, (nxv16i8 (splat_vector (i32 -1))), nxv16i8:$Op2)), + (NEG_ZPmZ_B (DUP_ZI_B -1, 0), $Op1, $Op2)>; + def : Pat<(nxv8i16 (AArch64mul_m1 nxv8i1:$Op1, (nxv8i16 (splat_vector (i32 -1))), nxv8i16:$Op2)), + (NEG_ZPmZ_H (DUP_ZI_H -1, 0), $Op1, $Op2)>; + def : Pat<(nxv4i32 (AArch64mul_m1 nxv4i1:$Op1, (nxv4i32 (splat_vector (i32 -1))), nxv4i32:$Op2)), + (NEG_ZPmZ_S (DUP_ZI_S -1, 0), $Op1, $Op2)>; + def : Pat<(nxv2i64 (AArch64mul_m1 nxv2i1:$Op1, (nxv2i64 (splat_vector (i64 -1))), nxv2i64:$Op2)), + (NEG_ZPmZ_D (DUP_ZI_D -1, 0), $Op1, $Op2)>; } // End HasSVE_or_SME // COMPACT - word and doubleword diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8c4b4f6..50a8754 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5632,75 +5632,94 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, TTI::TargetCostKind CostKind) const { InstructionCost Invalid = InstructionCost::getInvalid(); - InstructionCost Cost(TTI::TCC_Basic); if (CostKind != TTI::TCK_RecipThroughput) return Invalid; - // Sub opcodes currently only occur in chained cases. - // Independent partial reduction subtractions are still costed as an add + if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() && + (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || OpAExtend == TTI::PR_None) return Invalid; + assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && + (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) && + "Unexpected values for OpBExtend or InputTypeB"); + // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. - // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but - // only if the i8mm or sve/streaming features are available. - if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB || - OpBExtend == TTI::PR_None || - (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && - !ST->isSVEorStreamingSVEAvailable()))) + if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) return Invalid; - assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && - "Unexpected values for OpBExtend or InputTypeB"); - EVT InputEVT = EVT::getEVT(InputTypeA); - EVT AccumEVT = EVT::getEVT(AccumType); + bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; + if (IsUSDot && !ST->hasMatMulInt8()) + return Invalid; + + unsigned Ratio = + AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits(); + if (VF.getKnownMinValue() <= Ratio) + return Invalid; + + VectorType *InputVectorType = VectorType::get(InputTypeA, VF); + VectorType *AccumVectorType = + VectorType::get(AccumType, VF.divideCoefficientBy(Ratio)); + // We don't yet support all kinds of legalization. + auto TA = TLI->getTypeAction(AccumVectorType->getContext(), + EVT::getEVT(AccumVectorType)); + switch (TA) { + default: + return Invalid; + case TargetLowering::TypeLegal: + case TargetLowering::TypePromoteInteger: + case TargetLowering::TypeSplitVector: + break; + } + + // Check what kind of type-legalisation happens. + std::pair<InstructionCost, MVT> AccumLT = + getTypeLegalizationCost(AccumVectorType); + std::pair<InstructionCost, MVT> InputLT = + getTypeLegalizationCost(InputVectorType); - unsigned VFMinValue = VF.getKnownMinValue(); + InstructionCost Cost = InputLT.first * TTI::TCC_Basic; - if (VF.isScalable()) { - if (!ST->isSVEorStreamingSVEAvailable()) - return Invalid; + // Prefer using full types by costing half-full input types as more expensive. + if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(), + TypeSize::getScalable(128))) + // FIXME: This can be removed after the cost of the extends are folded into + // the dot-product expression in VPlan, after landing: + // https://github.com/llvm/llvm-project/pull/147302 + Cost *= 2; - // Don't accept a partial reduction if the scaled accumulator is vscale x 1, - // since we can't lower that type. - unsigned Scale = - AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits(); - if (VFMinValue == Scale) - return Invalid; + if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) { + // i16 -> i64 is natively supported for udot/sdot + if (AccumLT.second.getScalarType() == MVT::i64 && + InputLT.second.getScalarType() == MVT::i16) + return Cost; + // i8 -> i64 is supported with an extra level of extends + if (AccumLT.second.getScalarType() == MVT::i64 && + InputLT.second.getScalarType() == MVT::i8) + // FIXME: This cost should probably be a little higher, e.g. Cost + 2 + // because it requires two extra extends on the inputs. But if we'd change + // that now, a regular reduction would be cheaper because the costs of + // the extends in the IR are still counted. This can be fixed + // after https://github.com/llvm/llvm-project/pull/147302 has landed. + return Cost; } - if (VF.isFixed() && - (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64)) - return Invalid; - if (InputEVT == MVT::i8) { - switch (VFMinValue) { - default: - return Invalid; - case 8: - if (AccumEVT == MVT::i32) - Cost *= 2; - else if (AccumEVT != MVT::i64) - return Invalid; - break; - case 16: - if (AccumEVT == MVT::i64) - Cost *= 2; - else if (AccumEVT != MVT::i32) - return Invalid; - break; - } - } else if (InputEVT == MVT::i16) { - // FIXME: Allow i32 accumulator but increase cost, as we would extend - // it to i64. - if (VFMinValue != 8 || AccumEVT != MVT::i64) - return Invalid; - } else - return Invalid; + // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE. + if (ST->isSVEorStreamingSVEAvailable() || + (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() && + ST->hasDotProd())) { + if (AccumLT.second.getScalarType() == MVT::i32 && + InputLT.second.getScalarType() == MVT::i8) + return Cost; + } - return Cost; + // Add additional cost for the extends that would need to be inserted. + return Cost + 4; } InstructionCost diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7003a40..9446144 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2126,6 +2126,8 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, Feature45BitNumRecordsBufferResource, + FeatureSupportsXNACK, + FeatureXNACK, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 6efa78e..a4ef524 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -608,8 +608,6 @@ public: ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] : EmptySet; - const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size(); - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { // Each iteration of this loop assigns exactly one global variable to // exactly one of the implementation strategies. @@ -649,8 +647,7 @@ public: ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (K.second.size() == HybridModuleRootKernelsSize && - set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second == HybridModuleRootKernels) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2d5ae29..2120bf8 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2303,7 +2303,10 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; if (!hasArchitectedFlatScratch()) KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; - KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; + bool ReservedXnackMask = STI.hasFeature(AMDGPU::FeatureXNACK); + assert(!ReservedXnackMask || STI.hasFeature(AMDGPU::FeatureSupportsXNACK)); + KdStream << Indent << ".amdhsa_reserve_xnack_mask " << ReservedXnackMask + << '\n'; KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_PRIORITY); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index fed3778..82789bc 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -722,7 +722,8 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; } - if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) { + if (New->getReg().isVirtual() && + !MRI->constrainRegClass(New->getReg(), ConstrainRC)) { LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI) << TRI->getRegClassName(ConstrainRC) << '\n'); return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index e4b3528..0189e7b 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -306,7 +306,8 @@ class PrologEpilogSGPRSpillBuilder { buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, FI, FrameReg, DwordOff); - MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); + assert(SubReg.isPhysical()); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f7265c5..79876ff 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -18879,7 +18879,7 @@ bool SITargetLowering::checkForPhysRegDependency( PhysReg = AMDGPU::SCC; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); - Cost = RC->getCopyCost(); + Cost = RC->expensiveOrImpossibleToCopy() ? -1 : RC->getCopyCost(); return true; } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 76bfce8..5e27b37 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1013,6 +1013,15 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } else if (T == X_CNT) { + WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; + if (PendingEvents & (1 << OtherEvent)) { + // Hardware inserts an implicit xcnt between interleaved + // SMEM and VMEM operations. So there will never be + // outstanding address translations for both SMEM and + // VMEM at the same time. + setScoreLB(T, CurrScore - 1); + PendingEvents &= ~(1 << OtherEvent); + } for (const MachineOperand &Op : Inst.all_uses()) setScoreByOperand(&Inst, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { @@ -2220,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. + // For architectures with X_CNT, mark the source address operands + // with the appropriate counter values. // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. bool IsVMEMAccess = false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 56435a5..cda8069 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2112,8 +2112,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::SI_RESTORE_S32_FROM_VGPR: MI.setDesc(get(AMDGPU::V_READLANE_B32)); - MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(), - &AMDGPU::SReg_32_XM0RegClass); break; case AMDGPU::AV_MOV_B32_IMM_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); @@ -8117,21 +8115,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // hope for the best. if (Inst.isCopy() && DstReg.isPhysical() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { - // TODO: Only works for 32 bit registers. - if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) { - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), DstReg) - .add(Inst.getOperand(1)); - } else { - Register NewDst = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), NewDst) - .add(Inst.getOperand(1)); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), - DstReg) - .addReg(NewDst); - } + Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), NewDst) + .add(Inst.getOperand(1)); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), + DstReg) + .addReg(NewDst); + Inst.eraseFromParent(); return; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 205237f..3c2dd42 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2222,8 +2222,6 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, // Don't need to write VGPR out. } - MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); - // Restore clobbered registers in the specified restore block. MI = RestoreMBB.end(); SB.setMI(&RestoreMBB, MI); @@ -2238,7 +2236,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, SB.NumSubRegs == 1 ? SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); - MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); + + assert(SubReg.isPhysical()); bool LastSubReg = (i + 1 == e); auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) @@ -3059,8 +3058,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (IsSALU && LiveSCC) { Register NewDest; if (IsCopy) { - MF->getRegInfo().constrainRegClass(ResultReg, - &AMDGPU::SReg_32_XM0RegClass); + assert(ResultReg.isPhysical()); NewDest = ResultReg; } else { NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, @@ -3190,8 +3188,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Register NewDest; if (IsCopy) { - MF->getRegInfo().constrainRegClass(ResultReg, - &AMDGPU::SReg_32_XM0RegClass); NewDest = ResultReg; } else { NewDest = RS->scavengeRegisterBackwards( diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 8f1dd62..5630580 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, let HasSGPR = 1; let Size = 64; } + +def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} + +def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128_Align2, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} } // End GeneratePressureSet = 0 // Define a register tuple class, along with one requiring an even diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index fa130a1..26ff54c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -775,6 +775,16 @@ class VectorType; bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getScalarSizeInBits() <= 32; + } + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index d0dfa47..0040504 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -449,6 +449,7 @@ HexagonTargetLowering::initializeHVXLowering() { // Include cases which are not hander earlier setOperationAction(ISD::UINT_TO_FP, MVT::v32i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v64i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v32i1, Custom); setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT}); } @@ -2337,7 +2338,7 @@ HexagonTargetLowering::LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const { return ExpandHvxFpToInt(Op, DAG); } -// For vector type v32i1 uint_to_fp to v32f32: +// For vector type v32i1 uint_to_fp/sint_to_fp to v32f32: // R1 = #1, R2 holds the v32i1 param // V1 = vsplat(R1) // V2 = vsplat(R2) @@ -2464,7 +2465,7 @@ HexagonTargetLowering::LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const { MVT IntTy = ty(Op.getOperand(0)).getVectorElementType(); MVT FpTy = ResTy.getVectorElementType(); - if (Op.getOpcode() == ISD::UINT_TO_FP) { + if (Op.getOpcode() == ISD::UINT_TO_FP || Op.getOpcode() == ISD::SINT_TO_FP) { if (ResTy == MVT::v32f32 && ty(Op.getOperand(0)) == MVT::v32i1) return LowerHvxPred32ToFp(Op, DAG); if (ResTy == MVT::v64f16 && ty(Op.getOperand(0)) == MVT::v64i1) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 4cfbfca..7ddf996 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2860,8 +2860,7 @@ static SDValue fillSubVectorFromBuildVector(BuildVectorSDNode *Node, EVT ResTy, unsigned first) { unsigned NumElts = ResTy.getVectorNumElements(); - assert(first >= 0 && - first + NumElts <= Node->getSimpleValueType(0).getVectorNumElements()); + assert(first + NumElts <= Node->getSimpleValueType(0).getVectorNumElements()); SmallVector<SDValue, 16> Ops(Node->op_begin() + first, Node->op_begin() + first + NumElts); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 395d2c4..662d3f6 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -629,7 +629,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FTAN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FEXP, G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, G_FSINH, - G_FTANH}) + G_FTANH, G_FMODF}) .libcallFor({s32, s64}) .libcallFor(ST.is64Bit(), {s128}); getActionDefinitionsBuilder({G_FPOWI, G_FLDEXP}) diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index af1ceb6..cf6f83a 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -110,16 +110,16 @@ def : StPat<truncstorei8, SB, GPR, i16>; let Predicates = [HasAtomicLdSt] in { // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<atomic_load_aext_8, LBU, i16>; - def : LdPat<atomic_load_nonext_16, LH, i16>; + def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; + def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - def : StPat<atomic_store_8, SB, GPR, i16>; - def : StPat<atomic_store_16, SH, GPR, i16>; + def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; + def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; } let Predicates = [HasAtomicLdSt, IsRV64] in { // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<atomic_store_32, SW, GPR, i32>; + def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 50649cf..dcce2d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -533,7 +533,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FREM, ISD::FPOW, ISD::FPOWI, ISD::FCOS, ISD::FSIN, ISD::FSINCOS, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FLOG, ISD::FLOG2, - ISD::FLOG10, ISD::FLDEXP, ISD::FFREXP}, + ISD::FLOG10, ISD::FLDEXP, ISD::FFREXP, ISD::FMODF}, MVT::f16, Promote); // FIXME: Need to promote f16 STRICT_* to f32 libcalls, but we don't have diff --git a/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 6c19049..024030d 100644 --- a/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -206,8 +206,8 @@ Filler::findDelayInstr(MachineBasicBlock &MBB, if (!done) --I; - // skip debug instruction - if (I->isDebugInstr()) + // Skip meta instructions. + if (I->isMetaInstruction()) continue; if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() || diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 143c4c4..e7709ef 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -149,6 +149,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, }); } + getActionDefinitionsBuilder({G_UMIN, G_UMAX, G_SMIN, G_SMAX}) + .widenScalarToNextPow2(0, /*Min=*/32) + .lower(); + // integer addition/subtraction getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({s8, s16, s32}) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cda5568..02b20b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45457,7 +45457,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget) { EVT SrcVT = Src.getValueType(); - if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) + if (Subtarget.useSoftFloat() || !SrcVT.isSimple() || + SrcVT.getScalarType() != MVT::i1) return SDValue(); // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type @@ -58134,6 +58135,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) return V; + // Prefer VSHLI to reduce uses, X86FixupInstTunings may revert this depending + // on the scheduler model. Limit multiple users to AVX+ targets to prevent + // introducing extra register moves. + if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL)) + if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode())) + return getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT.getSimpleVT(), + Op0, 1, DAG); + // Canonicalize hidden LEA pattern: // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y) // iff c < 4 |