diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 166 |
1 files changed, 158 insertions, 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1a8ee1..fbaf9bc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -1061,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1254,6 +1256,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1475,6 +1496,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1519,6 +1546,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1548,7 +1595,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -1599,18 +1648,32 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4440,19 +4503,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -14154,6 +14226,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) @@ -15869,6 +15943,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15917,6 +16063,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: |