Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 |
1 file changed, 29 insertions, 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 63826b7..e866bd4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+    // This assumes the registers are allocated by CCInfo in ascending order
+    // with no gaps.
+    Info->setNumWaveDispatchSGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+    Info->setNumWaveDispatchVGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+  } else if (Info->getNumKernargPreloadedSGPRs()) {
+    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
   }
 
   SmallVector<SDValue, 16> Chains;
@@ -6106,6 +6115,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   case MVT::f64:
     return true;
   case MVT::f16:
+  case MVT::bf16:
     return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
   default:
     break;
@@ -10877,6 +10887,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
+// Return whether the operation has NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+  return (Addr.getOpcode() == ISD::ADD &&
+          Addr->getFlags().hasNoUnsignedWrap()) ||
+         Addr->getOpcode() == ISD::OR;
+}
+
 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
                                               EVT PtrVT) const {
   return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
@@ -10898,8 +10915,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
     N0 = SDValue();
   else if (DAG.isBaseWithConstantOffset(N0)) {
-    C1 = cast<ConstantSDNode>(N0.getOperand(1));
-    N0 = N0.getOperand(0);
+    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+    // being added, so we can only safely match a 32-bit addition with no
+    // unsigned overflow.
+    bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+    if (!CheckNUW || isNoUnsignedWrap(N0)) {
+      C1 = cast<ConstantSDNode>(N0.getOperand(1));
+      N0 = N0.getOperand(0);
+    }
   }
 
   if (C1) {
@@ -16902,7 +16925,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
 
   if (RC) {
     if (NumRegs > 1) {
-      if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs())
+      if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
         return std::pair(0U, nullptr);
 
       uint32_t Width = NumRegs * 32;
@@ -17695,6 +17718,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
         RMW->hasMetadata("amdgpu.no.remote.memory"))
       return true;
+    if (Subtarget.hasEmulatedSystemScopeAtomics())
+      return true;
   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
     return true;
 
@@ -17942,8 +17967,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::UMax: {
     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope min/max atomics.
-      if (HasSystemScope)
+      if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics())
        return AtomicExpansionKind::CmpXChg;
     }
 
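The splitBufferOffsets hunk above hinges on a numeric detail: on GFX1250, voffset and the constant immoffset are each zero-extended from 32 bits before being added, so splitting a wrapping 32-bit add changes the computed address. The following standalone sketch (plain C++ with hypothetical values, not part of the patch or of LLVM) illustrates that divergence:

// Standalone illustration: why the combine only splits a base+constant add
// that is known not to wrap (nuw, or a disjoint OR) on GFX1250-style
// addressing, where each component is zero-extended to 64 bits before adding.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 0xFFFFFFF0u; // would-be voffset (hypothetical value)
  uint32_t Imm = 0x20u;        // constant folded into immoffset

  // Unsplit 32-bit address computation: the add wraps around to 0x10.
  uint32_t Unsplit = Base + Imm;

  // Split addressing: voffset and immoffset are zero-extended to 64 bits
  // first, so the sum becomes 0x100000010 instead of 0x10.
  uint64_t Split = uint64_t(Base) + uint64_t(Imm);

  std::printf("unsplit 32-bit add: 0x%x\n", Unsplit);
  std::printf("zext64 split add:   0x%llx\n", (unsigned long long)Split);

  // The two results disagree exactly when the 32-bit add overflows, which is
  // why the patch gates the split behind the new isNoUnsignedWrap() check.
  return 0;
}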