diff options
Diffstat (limited to 'llvm/lib/Target')
124 files changed, 3450 insertions, 2452 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ca09598..99f0af5 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -39,8 +39,8 @@ let Predicates = [HasDotProd] in { def ext_addv_to_udot_addv : GICombineRule< (defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo), (match (wip_match_opcode G_VECREDUCE_ADD):$root, - [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]), - (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) + [{ return matchExtAddvToDotAddv(*${root}, MRI, STI, ${matchinfo}); }]), + (apply [{ applyExtAddvToDotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) >; } @@ -62,8 +62,10 @@ class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICom def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>; def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>; +def push_mul_through_zext : push_opcode_through_ext<G_MUL, G_ZEXT>; def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>; def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>; +def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, @@ -75,8 +77,10 @@ def AArch64PreLegalizerCombiner: GICombiner< ext_uaddv_to_uaddlv, push_sub_through_zext, push_add_through_zext, + push_mul_through_zext, push_sub_through_sext, - push_add_through_sext]> { + push_add_through_sext, + push_mul_through_sext]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eca7ca5..ad42f4b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM_PSEUDO, AArch64::LDNT1B_2Z_PSEUDO); @@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM_PSEUDO, AArch64::LDNT1H_2Z_PSEUDO); @@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM_PSEUDO, AArch64::LDNT1W_2Z_PSEUDO); @@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM_PSEUDO, AArch64::LDNT1D_2Z_PSEUDO); @@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM_PSEUDO, AArch64::LDNT1B_4Z_PSEUDO); @@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM_PSEUDO, AArch64::LDNT1H_4Z_PSEUDO); @@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM_PSEUDO, AArch64::LDNT1W_4Z_PSEUDO); @@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM_PSEUDO, AArch64::LDNT1D_4Z_PSEUDO); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b49754..4f6e3dd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8952,6 +8952,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFunction::CallSiteInfo CSInfo; @@ -8991,6 +8992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { @@ -11325,7 +11330,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue AArch64TargetLowering::LowerSELECT_CC( ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, - iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs, + iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. @@ -11386,6 +11391,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Canonicalise absolute difference patterns: + // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> + // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc + // + // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> + // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc + // The second forms can be matched into subs+cneg. + if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { + if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); + else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } + unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in @@ -11523,7 +11544,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return true; } })) { - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs(); SDValue VectorCmp = emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG); if (VectorCmp) @@ -11537,7 +11558,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - if (DAG.getTarget().Options.UnsafeFPMath) { + if (Flags.hasNoSignedZeros()) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); @@ -11616,10 +11637,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); - bool HasNoNans = Op->getFlags().hasNoNaNs(); + SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, - DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11627,7 +11647,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); - bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11694,8 +11713,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = - LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); + SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), + Op->getFlags(), DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -12292,7 +12311,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags = SDNodeFlags::AllowReassociation; + // Ensure nodes can be recognized by isAssociativeAndCommutative. + SDNodeFlags Flags = + SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros; // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -16674,7 +16695,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath)); + I->getFastMathFlags().allowContract())); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -24112,6 +24133,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, Store->getMemOperand()); } +// Combine store (fp_to_int X) to use vector semantics around the conversion +// when NEON is available. This allows us to store the in-vector result directly +// without transferring the result into a GPR in the process. +static SDValue combineStoreValueFPToInt(StoreSDNode *ST, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + // Limit to post-legalization in order to avoid peeling truncating stores. + if (DCI.isBeforeLegalize()) + return SDValue(); + if (!Subtarget->isNeonAvailable()) + return SDValue(); + // Source operand is already a vector. + SDValue Value = ST->getValue(); + if (Value.getValueType().isVector()) + return SDValue(); + + // Look through potential assertions. + while (Value->isAssert()) + Value = Value.getOperand(0); + + if (Value.getOpcode() != ISD::FP_TO_SINT && + Value.getOpcode() != ISD::FP_TO_UINT) + return SDValue(); + if (!Value->hasOneUse()) + return SDValue(); + + SDValue FPSrc = Value.getOperand(0); + EVT SrcVT = FPSrc.getValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // No support for assignments such as i64 = fp_to_sint i32 + EVT VT = Value.getSimpleValueType(); + if (VT != SrcVT.changeTypeToInteger()) + return SDValue(); + + // Create a 128-bit element vector to avoid widening. The floating point + // conversion is transformed into a single element conversion via a pattern. + unsigned NumElements = 128 / SrcVT.getFixedSizeInBits(); + EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements); + EVT VecDstVT = VecSrcVT.changeTypeToInteger(); + SDLoc DL(ST); + SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc); + SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP); + + SDValue Zero = DAG.getVectorIdxConstant(0, DL); + SDValue Extracted = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero); + + DCI.CombineTo(ST->getValue().getNode(), Extracted); + return SDValue(ST, 0); +} + bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || @@ -24194,6 +24269,9 @@ static SDValue performSTORECombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(ST); + if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget)) + return Res; + auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; @@ -26926,6 +27004,23 @@ static SDValue performSHLCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); } +static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntrinsicID = N->getConstantOperandVal(1); + auto Register = + (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR + : AArch64SysReg::RNDRRS); + SDLoc DL(N); + SDValue A = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), + N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); + SDValue B = DAG.getNode( + AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); + return DAG.getMergeValues( + {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -27241,22 +27336,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: - case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = N->getConstantOperandVal(1); - auto Register = - (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR - : AArch64SysReg::RNDRRS); - SDLoc DL(N); - SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), - N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); - return DAG.getMergeValues( - {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); - } + case Intrinsic::aarch64_rndrrs: + return performRNDRCombine(N, DAG); case Intrinsic::aarch64_sme_ldr_zt: return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), DAG.getVTList(MVT::Other), N->getOperand(0), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 95d0e3b..ea63edd8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -662,7 +662,7 @@ private: SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, iterator_range<SDNode::user_iterator> Users, - bool HasNoNans, const SDLoc &dl, + SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8685d7a0..59d4fd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by // the target options or if FADD/FSUB has the contract fast-math flag. - return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || + return Options.AllowFPOpFusion == FPOpFusion::Fast || Inst.getFlag(MachineInstr::FmContract); - return true; } return false; } @@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, case AArch64::FMUL_ZZZ_H: case AArch64::FMUL_ZZZ_S: case AArch64::FMUL_ZZZ_D: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || - (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz)); + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); // == Integer types == // -- Base instructions -- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 07cacfa..251fd44 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } +def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>; +def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>; + // int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index abcd550..b97d622 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -12,7 +12,7 @@ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri // // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi // MOVi64imm + SUBXrr ==> SUBXri + SUBXri @@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { template <typename T> bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + // Strategy used to split logical immediate bitmasks. + enum class SplitStrategy { + Intersect, + }; template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); + bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); - if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) - return false; - - // If this immediate can be handled by one instruction, do not split it. - SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; - AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); - if (Insn.size() == 1) - return false; // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, - unsigned OtherOpc) { +bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, + unsigned OtherOpc) { // Try below transformation. // // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri @@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, return splitTwoPartImm<T>( MI, - [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { - if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { + // If this immediate is already a suitable bitmask, don't split it. + // TODO: Should we just combine the two instructions in this case? + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return std::nullopt; + + // If this immediate can be handled by one instruction, don't split it. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return std::nullopt; + + bool SplitSucc = false; + switch (Strategy) { + case SplitStrategy::Intersect: + SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; + } + if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); return std::nullopt; }, @@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= visitINSERT(MI); break; case AArch64::ANDWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDSWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + Changed |= trySplitLogicalImm<uint32_t>( + AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri); break; case AArch64::ANDSXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + Changed |= trySplitLogicalImm<uint64_t>( + AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; case AArch64::ORRWrs: Changed |= visitORR(MI); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 40f49da..e1adc0b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -270,6 +270,13 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMECallAttrs CallAttrs(*Caller, *Callee); + // Never inline a function explicitly marked as being streaming, + // into a non-streaming function. Assume it was marked as streaming + // for a reason. + if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() && + CallAttrs.callee().hasStreamingInterfaceOrBody()) + return false; + // When inlining, we should consider the body of the function, not the // interface. if (CallAttrs.callee().hasStreamingBody()) { @@ -4905,14 +4912,17 @@ void AArch64TTIImpl::getUnrollingPreferences( // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; - // No need to unroll auto-vectorized loops - if (findStringMetadataForLoop(L, "llvm.loop.isvectorized")) - return; - // Scan the loop: don't unroll loops with calls as this could prevent - // inlining. + // inlining. Don't unroll auto-vectorized loops either, though do allow + // unrolling of the scalar remainder. + bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized"); for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Both auto-vectorized loops and the scalar remainder have the + // isvectorized attribute, so differentiate between them by the presence + // of vector instructions. + if (IsVectorized && I.getType()->isVectorTy()) + return; if (isa<CallBase>(I)) { if (isa<CallInst>(I) || isa<InvokeInst>(I)) if (const Function *F = cast<CallBase>(I).getCalledFunction()) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index 0b79850..1a15075 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -50,8 +50,10 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub, // // %sub = G_SUB 0, %y // %cmp = G_ICMP eq/ne, %z, %sub + // or with signed comparisons with the no-signed-wrap flag set if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB || - !CmpInst::isEquality(Pred)) + (!CmpInst::isEquality(Pred) && + !(CmpInst::isSigned(Pred) && MaybeSub->getFlag(MachineInstr::NoSWrap)))) return false; auto MaybeZero = getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 1381a9b..d905692 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1810,7 +1810,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp( // Couldn't optimize. Emit a compare + a Bcc. MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - auto PredOp = ICmp.getOperand(1); + auto &PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( static_cast<CmpInst::Predicate>(PredOp.getPredicate())); @@ -2506,12 +2506,12 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { return false; } auto &PredOp = Cmp->getOperand(1); - auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); MIB.setInstrAndDebugLoc(I); emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); + const AArch64CC::CondCode InvCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; @@ -3574,10 +3574,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; } - auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); + auto &PredOp = I.getOperand(1); + emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); - emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); @@ -5096,11 +5097,11 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { - auto Pred = - static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); + auto &PredOp = CondDef->getOperand(1); + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp, + MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); CondCode = changeICMPPredToAArch64CC(Pred); - emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB); } else { // Get the condition code for the select. auto Pred = @@ -5148,29 +5149,37 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); + // Given this: // // x = G_SUB 0, y - // G_ICMP x, z + // G_ICMP z, x // // Produce this: // - // cmn y, z - if (isCMN(LHSDef, P, MRI)) - return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + // cmn z, y + if (isCMN(RHSDef, P, MRI)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); - // Same idea here, but with the RHS of the compare instead: + // Same idea here, but with the LHS of the compare instead: // // Given this: // // x = G_SUB 0, y - // G_ICMP z, x + // G_ICMP x, z // // Produce this: // - // cmn z, y - if (isCMN(RHSDef, P, MRI)) - return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + // cmn y, z + // + // But be careful! We need to swap the predicate! + if (isCMN(LHSDef, P, MRI)) { + if (!CmpInst::isEquality(P)) { + P = CmpInst::getSwappedPredicate(P); + Predicate = MachineOperand::CreatePredicate(P); + } + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + } // Given this: // diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb0f667b..e0e1af7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; }; + auto LowerTriOp = [&MI, &MIB](unsigned Opcode) { + MIB.buildInstr(Opcode, {MI.getOperand(0)}, + {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)}); + MI.eraseFromParent(); + return true; + }; Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { @@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_USUBSAT); break; } + case Intrinsic::aarch64_neon_udot: + return LowerTriOp(AArch64::G_UDOT); + case Intrinsic::aarch64_neon_sdot: + return LowerTriOp(AArch64::G_SDOT); case Intrinsic::vector_reverse: // TODO: Add support for vector_reverse diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 1cd9453..8c10673 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -228,12 +228,13 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset))); } -// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y)) -// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1)) +// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1)) // Similar to performVecReduceAddCombine in SelectionDAG -bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -246,31 +247,57 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32) return false; - LLT SrcTy; - auto I1Opc = I1->getOpcode(); - if (I1Opc == TargetOpcode::G_MUL) { + // Detect mul(ext, ext) with symmetric ext's. If I1Opc is G_ZEXT or G_SEXT + // then the ext's must match the same opcode. It is set to the ext opcode on + // output. + auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1, + Register &Out2, unsigned &I1Opc) { // If result of this has more than 1 use, then there is no point in creating - // udot instruction - if (!MRI.hasOneNonDBGUse(MidReg)) + // a dot instruction + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; MachineInstr *ExtMI1 = - getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); MachineInstr *ExtMI2 = - getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI); LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg()); LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg()); if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy) return false; + if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) && + I1Opc != ExtMI1->getOpcode()) + return false; + Out1 = ExtMI1->getOperand(1).getReg(); + Out2 = ExtMI2->getOperand(1).getReg(); I1Opc = ExtMI1->getOpcode(); - SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg(); + return true; + }; + + LLT SrcTy; + unsigned I1Opc = I1->getOpcode(); + if (I1Opc == TargetOpcode::G_MUL) { + Register Out1, Out2; + if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc)) + return false; + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; } else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) { - SrcTy = MRI.getType(I1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = I1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = 0; + Register I1Op = I1->getOperand(1).getReg(); + MachineInstr *M = getDefIgnoringCopies(I1Op, MRI); + Register Out1, Out2; + if (M->getOpcode() == TargetOpcode::G_MUL && + tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) { + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; + } else { + SrcTy = MRI.getType(I1Op); + std::get<0>(MatchInfo) = I1Op; + std::get<1>(MatchInfo) = 0; + } } else { return false; } @@ -288,11 +315,11 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } -void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &Builder, - GISelChangeObserver &Observer, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &Builder, + GISelChangeObserver &Observer, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -553,15 +580,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } -// Pushes ADD/SUB through extend instructions to decrease the number of extend -// instruction at the end by allowing selection of {s|u}addl sooner - +// Pushes ADD/SUB/MUL through extend instructions to decrease the number of +// extend instruction at the end by allowing selection of {s|u}addl sooner // i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, Register DstReg, Register SrcReg1, Register SrcReg2) { assert((MI.getOpcode() == TargetOpcode::G_ADD || - MI.getOpcode() == TargetOpcode::G_SUB) && - "Expected a G_ADD or G_SUB instruction\n"); + MI.getOpcode() == TargetOpcode::G_SUB || + MI.getOpcode() == TargetOpcode::G_MUL) && + "Expected a G_ADD, G_SUB or G_MUL instruction\n"); // Deal with vector types only LLT DstTy = MRI.getType(DstReg); @@ -594,9 +621,10 @@ void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); // G_SUB has to sign-extend the result. - // G_ADD needs to sext from sext and can sext or zext from zext, so the - // original opcode is used. - if (MI.getOpcode() == TargetOpcode::G_ADD) + // G_ADD needs to sext from sext and can sext or zext from zext, and G_MUL + // needs to use the original opcode so the original opcode is used for both. + if (MI.getOpcode() == TargetOpcode::G_ADD || + MI.getOpcode() == TargetOpcode::G_MUL) B.buildInstr(Opc, {DstReg}, {AddReg}); else B.buildSExt(DstReg, AddReg); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index b9d3e1b..6912caf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -461,7 +461,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8b8fc8b..8a0c4ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; +def FeatureCUStores : SubtargetFeature<"cu-stores", + "HasCUStores", + "true", + "Whether SCOPE_CU stores can be used on GFX12.5" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -1383,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", "Has v_add_u64 and v_sub_u64 instructions">; +def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", + "true", "Has v_mad_u32 instruction">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1988,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet< def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, + FeatureCUStores, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, @@ -2042,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVmemPrefInsts, FeatureLshlAddU64Inst, FeatureAddSubU64Insts, + FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2422,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts : def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; @@ -2565,6 +2576,10 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasAddMinMaxInsts : + Predicate<"Subtarget->hasAddMinMaxInsts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasPkAddMinMaxInsts : Predicate<"Subtarget->hasPkAddMinMaxInsts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; @@ -2832,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 4b3dc37..6681393 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + if (ST.isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (isGFX1250(ST) && ST.hasCUStores()) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; + } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5f19837..a9278c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -static bool hasUnsafeFPMath(const Function &F) { - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); -} - class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -104,7 +100,6 @@ public: const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; - const bool HasUnsafeFPMath; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -117,7 +112,6 @@ public: const DominatorTree *DT, const UniformityInfo &UA) : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), DT(DT), UA(UA), DL(F.getDataLayout()), - HasUnsafeFPMath(hasUnsafeFPMath(F)), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, return false; // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || HasUnsafeFPMath || - SqrtOp->getFPAccuracy() >= 1.0f; + return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 Value * @@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( // // With rcp: // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn. // // With fdiv.fast: // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. @@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { RsqOp = SqrtOp->getOperand(0); } - // Inaccurate rcp is allowed with unsafe-fp-math or afn. + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. // @@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { // expansion of afn to codegen. The current interpretation is so aggressive we // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here. - const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + const bool AllowInaccurateRcp = DivFMF.approxFunc(); if (!RsqOp && AllowInaccurateRcp) return false; @@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { // We're trying to handle the fast-but-not-that-fast case only. The lowering // of fast llvm.sqrt will give the raw instruction anyway. - if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + if (SqrtFMF.approxFunc()) return false; const float ReqdAccuracy = FPOp->getFPAccuracy(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index c01e5d3..992572f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,9 @@ def gi_global_saddr_cpol : def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, + GIComplexPatternEquiv<GlobalSAddrNoIOffset>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 2991778..19b8757 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -204,7 +204,7 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const { for (auto &Op : Node->operands()) Dims.push_back(Dims.getDocument()->getNode( - uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue()))); + mdconst::extract<ConstantInt>(Op)->getZExtValue())); return Dims; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index dfaa145..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2049,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 5636d89..983f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -174,6 +174,8 @@ private: bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f25ce87..31c4f62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2634,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, @@ -2757,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs() || - Options.ApproxFuncFPMath || Options.UnsafeFPMath) { + Options.ApproxFuncFPMath) { if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { // Log and multiply in f32 is good enough for f16. @@ -3585,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con if (N0.getValueType() == MVT::f32) return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { // There is a generic expand for FP_TO_FP16 with unsafe fast math. return SDValue(); } @@ -4846,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. - if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 266dee1..b0d3b12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); @@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; + if (!IsB32 && STI.hasTrue16BitInsts()) + Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64 + : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64; unsigned CBL = STI.getConstantBusLimit(Opc); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -5789,6 +5801,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; @@ -6971,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6986,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -7021,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + ? (int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index fe9743d0a..140e753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -264,6 +264,8 @@ private: selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7a50923..511fc69 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode() def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } def FMA : Predicate<"Subtarget->hasFMA()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fedfa3f..1fdf272 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElements(0, S16, 2) - .minScalar(0, S16) - .widenScalarToNextPow2(0) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ABS) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + if (ST.hasIntMinMax64()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, S64, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } } else { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16}) @@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); - if (ST.hasGFX90AInsts()) { + if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer @@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::scalar(32), commonAlignment(Align(64), Offset)); // Pointer address - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - B.buildPtrAdd(LoadAddr, QueuePtr, - B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); + B.buildObjectPtrOffset( + LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -3326,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { if (Flags & MachineInstr::FmAfn) return true; const auto &Options = MF.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, @@ -3432,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || - TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { + TM.Options.ApproxFuncFPMath) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); auto PromoteSrc = B.buildFPExt(F32, X); @@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, llvm_unreachable("failed to find kernarg segment ptr"); auto COffset = B.buildConstant(LLT::scalar(64), Offset); - // TODO: Should get nuw - return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); + return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0); } /// Legalize a value that's loaded from kernel arguments. This is only used by @@ -4860,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || - MF.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) @@ -4922,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (!AllowInaccurateRcp) return false; @@ -5676,8 +5689,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; - // FIXME: This should be nuw - B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildObjectPtrOffset(DstReg, KernargPtrReg, + B.buildConstant(IdxTy, Offset).getReg(0)); return true; } @@ -7019,8 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( // Pointer address Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); B.buildCopy(SGPR01, Temp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 8767208..aa75534 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -53,8 +53,6 @@ private: using FuncInfo = llvm::AMDGPULibFunc; - bool UnsafeFPMath = false; - // -fuse-native. bool AllNative = false; @@ -117,7 +115,6 @@ private: bool AllowStrictFP = false); protected: - bool isUnsafeMath(const FPMathOperator *FPOp) const; bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; @@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, return AMDGPULibFunc::parse(FMangledName, FInfo); } -bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || FPOp->isFast(); -} - bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || - (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); + return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs(); } bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( const FPMathOperator *FPOp) const { // TODO: Refine to approxFunc or contract - return isUnsafeMath(FPOp); + return FPOp->isFast(); } void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool(); AC = &FAM.getResult<AssumptionAnalysis>(F); TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f471881..b45627d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, BasePlusOffset = Base; } else { auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); - BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); + BasePlusOffset = + B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0); } auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c5a1d9e..c8e45d4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4009,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: @@ -4022,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: + if (isSALUMapping(MI)) { + // There are no scalar 64-bit min and max, use vector instruction instead. + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 && + Subtarget.hasIntMinMax64()) + return getDefaultMappingVOP(MI); + return getDefaultMappingSOP(MI); + } + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: case AMDGPU::G_FMUL: @@ -4566,8 +4574,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_cvt_sr_pk_f16_f32: + case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_cvt_pk_fp8_f16: + case Intrinsic::amdgcn_cvt_pk_bf8_f16: + case Intrinsic::amdgcn_cvt_sr_fp8_f16: + case Intrinsic::amdgcn_cvt_sr_bf8_f16: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -4619,8 +4642,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_f32_fp8: case Intrinsic::amdgcn_cvt_pk_f32_bf8: case Intrinsic::amdgcn_cvt_pk_fp8_f32: + case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3: case Intrinsic::amdgcn_cvt_pk_bf8_f32: case Intrinsic::amdgcn_cvt_sr_fp8_f32: + case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3: case Intrinsic::amdgcn_cvt_sr_bf8_f32: case Intrinsic::amdgcn_cvt_sr_bf16_f32: case Intrinsic::amdgcn_cvt_sr_f16_f32: @@ -5364,6 +5389,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..c1f1703 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,7 +104,9 @@ #include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" @@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. @@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // TODO: May want to move later or split into an early and late one. addPass(AMDGPUCodeGenPreparePass(TM)); - // TODO: LICM + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. + if (TM.getOptLevel() > CodeGenOptLevel::Less) { + addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true)); + } } Base::addIRPasses(addPass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 24f4df2..a0c99b0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( // Estimate all types may be fused with contract/unsafe flags const TargetOptions &Options = TLI->getTargetMachine().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || (FAdd->hasAllowContract() && CxtI->hasAllowContract())) return TargetTransformInfo::TCC_Free; } @@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } - if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || - TLI->getTargetMachine().Options.UnsafeFPMath)) { + if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) { // Fast unsafe fdiv lowering: // f32 rcp // f32 fmul diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 421fc42..a83caa0 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -180,6 +180,7 @@ public: ImmTyMatrixBFMT, ImmTyMatrixAReuse, ImmTyMatrixBReuse, + ImmTyScaleSel, ImmTyByteSel, }; @@ -689,6 +690,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); } + bool isVISrcB32() const { return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); } @@ -1182,6 +1185,7 @@ public: case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; + case ImmTyScaleSel: OS << "ScaleSel" ; break; case ImmTyByteSel: OS << "ByteSel" ; break; } // clang-format on @@ -2036,6 +2040,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2405,6 +2410,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: @@ -2456,6 +2462,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo setImmKindConst(); return; } + [[fallthrough]]; + + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: Inst.addOperand(MCOperand::createImm(Lo_32(Val))); setImmKindLiteral(); @@ -3761,6 +3770,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16) return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16) + return false; + llvm_unreachable("invalid operand type"); } default: @@ -6066,6 +6078,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; + } else if (ID == ".amdhsa_uses_cu_stores") { + if (!isGFX1250()) + return Error(IDRange.Start, "directive requires gfx12.5", IDRange); + + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) @@ -9350,6 +9368,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, } } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::scale_sel)) + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyScaleSel); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyClamp); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) { if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) Inst.addOperand(Inst.getOperand(0)); @@ -9357,10 +9383,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand::ImmTyByteSel); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) - addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTyClamp); - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); @@ -9414,8 +9436,22 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) { + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { Inst.addOperand(Inst.getOperand(0)); } @@ -10010,9 +10046,12 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClamp); - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) { + if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) + Inst.addOperand(Inst.getOperand(0)); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyByteSel); + } if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index f99e716..1956a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { } //===----------------------------------------------------------------------===// -// MUBUF - GFX11, GFX12. +// MUBUF - GFX11, GFX12, GFX1250. //===----------------------------------------------------------------------===// // gfx11 instruction that accept both old and new assembler name. @@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op, def : Mnem_gfx12<gfx11_name, gfx12_name>; } +multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> { + def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>; +} + defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>; defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>; @@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; +defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>; +defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">; +defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 42edec0..c466f9c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen Instrumentation MC MIRParser + ObjCARC Passes Scalar SelectionDAG diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 319cc9d..3ff675d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; +defm DS_ADD_F64 : DS_Real_gfx12<0x054>; +defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>; + let AssemblerPredicate = HasLdsBarrierArriveAtomic in { defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 5c1989b..ffe6b06 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (isGFX1250()) + PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", + KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7207c25..d5d1074 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,6 +11,7 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>; def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; + def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; @@ -369,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS addess). +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -1156,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -1315,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) @@ -1525,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatStoreLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; @@ -2091,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in { defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; } // End SubtargetPredicate = isGFX125xOnly +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>; + + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -3374,12 +3473,29 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; +defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; + +defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">; +defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">; + def True16D16Table : GenericTable { let FilterClass = "True16D16Table"; let CppTypeName = "True16D16Info"; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ce1ce68..96d5668 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -592,10 +592,13 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -666,10 +669,13 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, // MaxMemoryClause-specific: We prioritize clustered instructions as we would // get more benefit from clausing these memory instructions. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -896,15 +902,10 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const { assert(!Regions.empty()); std::vector<MachineInstr *> RegionFirstMIs; RegionFirstMIs.reserve(Regions.size()); - auto I = Regions.rbegin(), E = Regions.rend(); - do { - const MachineBasicBlock *MBB = I->first->getParent(); - auto *MI = &*skipDebugInstructionsForward(I->first, I->second); - RegionFirstMIs.push_back(MI); - do { - ++I; - } while (I != E && I->first->getParent() == MBB); - } while (I != E); + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + RegionFirstMIs.push_back( + &*skipDebugInstructionsForward(RegionBegin, RegionEnd)); + return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); } @@ -941,11 +942,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() { Pressure.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); RegionsWithExcessRP.resize(Regions.size()); - RegionsWithMinOcc.resize(Regions.size()); RegionsWithIGLPInstrs.resize(Regions.size()); RegionsWithHighRP.reset(); RegionsWithExcessRP.reset(); - RegionsWithMinOcc.reset(); RegionsWithIGLPInstrs.reset(); runSchedStages(); @@ -1095,8 +1094,7 @@ bool PreRARematStage::initGCNSchedStage() { // fixed if there is another pass after this pass. assert(!S.hasNextStage()); - if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || - DAG.Regions.size() == 1) + if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1) return false; // Before performing any IR modification record the parent region of each MI @@ -1138,11 +1136,6 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { - for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) - DAG.RegionsWithMinOcc[IDX] = - DAG.Pressure[IDX].getOccupancy( - DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; - LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); @@ -1214,11 +1207,15 @@ bool GCNSchedStage::initGCNRegion() { } bool UnclusteredHighRPStage::initGCNRegion() { - // Only reschedule regions with the minimum occupancy or regions that may have - // spilling (excess register pressure). - if ((!DAG.RegionsWithMinOcc[RegionIdx] || - DAG.MinOccupancy <= InitialOccupancy) && - !DAG.RegionsWithExcessRP[RegionIdx]) + // Only reschedule regions that have excess register pressure (i.e. spilling) + // or had minimum occupancy at the beginning of the stage (as long as + // rescheduling of previous regions did not make occupancy drop back down to + // the initial minimum). + unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + if (!DAG.RegionsWithExcessRP[RegionIdx] && + (DAG.MinOccupancy <= InitialOccupancy || + DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != + InitialOccupancy)) return false; return GCNSchedStage::initGCNRegion(); @@ -1283,9 +1280,6 @@ void GCNSchedStage::checkScheduling() { if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; - DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; // Early out if we have achieved the occupancy target. LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); @@ -1319,7 +1313,6 @@ void GCNSchedStage::checkScheduling() { if (NewOccupancy < DAG.MinOccupancy) { DAG.MinOccupancy = NewOccupancy; MFI.limitOccupancy(DAG.MinOccupancy); - DAG.RegionsWithMinOcc.reset(); LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << DAG.MinOccupancy << ".\n"); } @@ -1341,14 +1334,10 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) { + if (shouldRevertScheduling(WavesAfter)) revertScheduling(); - } else { + else DAG.Pressure[RegionIdx] = PressureAfter; - DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; - } } unsigned @@ -1578,9 +1567,6 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { } void GCNSchedStage::revertScheduling() { - DAG.RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) == - DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 94cd795..32139a9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -250,9 +250,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // limit. Register pressure in these regions usually will result in spilling. BitVector RegionsWithExcessRP; - // Regions that has the same occupancy as the latest MinOccupancy - BitVector RegionsWithMinOcc; - // Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT). BitVector RegionsWithIGLPInstrs; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 0a0a107..0237a60 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -340,6 +340,43 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } +void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy, + const SchedRegion &Region) const { + const Function &F = Region.RegionBegin->getMF()->getFunction(); + Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction"); + if (!PostRADirectionAttr.isValid()) + return; + + StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString(); + if (PostRADirectionStr == "topdown") { + Policy.OnlyTopDown = true; + Policy.OnlyBottomUp = false; + } else if (PostRADirectionStr == "bottomup") { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = true; + } else if (PostRADirectionStr == "bidirectional") { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } else { + DiagnosticInfoOptimizationFailure Diag( + F, F.getSubprogram(), "invalid value for postRA direction attribute"); + F.getContext().diagnose(Diag); + } + + LLVM_DEBUG({ + const char *DirStr = "default"; + if (Policy.OnlyTopDown && !Policy.OnlyBottomUp) + DirStr = "topdown"; + else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp) + DirStr = "bottomup"; + else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp) + DirStr = "bidirectional"; + + dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr + << '\n'; + }); +} + void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { if (isWave32()) { // Fix implicit $vcc operands after MIParser has verified that they match diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 785ede3..6fe3abc 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -248,6 +248,7 @@ protected: bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; + bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -272,6 +273,7 @@ protected: bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -714,7 +716,9 @@ public: bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 - bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } + bool hasLdsAtomicAddF64() const { + return hasGFX90AInsts() || hasGFX1250Insts(); + } bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; @@ -998,6 +1002,8 @@ public: bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + bool hasCUStores() const { return HasCUStores; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1035,6 +1041,9 @@ public: void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override; + void overridePostRASchedPolicy(MachineSchedPolicy &Policy, + const SchedRegion &Region) const override; + void mirFileLoaded(MachineFunction &MF) const override; unsigned getMaxNumUserSGPRs() const { @@ -1516,9 +1525,22 @@ public: // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + + // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. + bool hasIntMinMax64() const { return GFX1250Insts; } + + // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. + bool hasAddMinMaxInsts() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 2a920f6..86d56855 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -149,7 +149,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 11b072e..42c4d8b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; default: llvm_unreachable("bad operand type"); } @@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: @@ -1790,4 +1793,14 @@ void AMDGPUInstPrinter::printBitOp3(const MCInst *MI, unsigned OpNo, O << formatHex(static_cast<uint64_t>(Imm)); } +void AMDGPUInstPrinter::printScaleSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (!Imm) + return; + + O << " scale_sel:" << formatDec(Imm); +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e0b7aa5..f6739b14 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -173,6 +173,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O, StringRef Prefix, bool PrintInHex, bool AlwaysPrint); + void printScaleSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printBitOp3(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index c49ad79..f358084 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return 255; + case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 10f6d33..43ca548 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); + if (isGFX1250(STI)) + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, + ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 40b8bcd..c564145 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -208,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_V2BF16, OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, OPERAND_REG_IMM_V2FP32, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index b77da4d..e934152 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d51ec6..4d67e4a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); } + if (Subtarget->hasIntMinMax64()) + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64, + Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, @@ -1256,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1527,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1623,10 +1666,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4241,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = BaseAddr.getValue(1); Align StackAlign = TFL->getStackAlign(); if (Alignment > StackAlign) { - uint64_t ScaledAlignment = (uint64_t)Alignment.value() + uint64_t ScaledAlignment = Alignment.value() << Subtarget->getWavefrontSizeLog2(); uint64_t StackAlignMask = ScaledAlignment - 1; SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, @@ -7148,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { SDValue Flags = Op.getOperand(1); SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); @@ -11243,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { // Without !fpmath accuracy information, we can't do more because we don't @@ -11263,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // 1.0 / sqrt(x) -> rsq(x) - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // XXX - Is afn sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); @@ -11297,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateDiv = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateDiv = Flags.hasApproximateFuncs(); if (!AllowInaccurateDiv) return SDValue(); @@ -14550,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || (N0->getFlags().hasAllowContract() && N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { @@ -15673,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, // regardless of the denorm mode setting. Therefore, - // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. + // fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; - if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if (Options.AllowFPOpFusion == FPOpFusion::Fast || (N->getFlags().hasAllowContract() && FMA->getFlags().hasAllowContract())) { Op1 = Op1.getOperand(0); @@ -15896,6 +15945,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15944,6 +16065,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: @@ -16700,56 +16825,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::pair(0U, RC); } - if (Constraint.starts_with("{") && Constraint.ends_with("}")) { - StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); - if (RegName.consume_front("v")) { + auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); + if (Kind != '\0') { + if (Kind == 'v') { RC = &AMDGPU::VGPR_32RegClass; - } else if (RegName.consume_front("s")) { + } else if (Kind == 's') { RC = &AMDGPU::SGPR_32RegClass; - } else if (RegName.consume_front("a")) { + } else if (Kind == 'a') { RC = &AMDGPU::AGPR_32RegClass; } if (RC) { - uint32_t Idx; - if (RegName.consume_front("[")) { - uint32_t End; - bool Failed = RegName.consumeInteger(10, Idx); - Failed |= !RegName.consume_front(":"); - Failed |= RegName.consumeInteger(10, End); - Failed |= !RegName.consume_back("]"); - if (!Failed) { - uint32_t Width = (End - Idx + 1) * 32; - // Prohibit constraints for register ranges with a width that does not - // match the required type. - if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) + if (NumRegs > 1) { + if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs()) + return std::pair(0U, nullptr); + + uint32_t Width = NumRegs * 32; + // Prohibit constraints for register ranges with a width that does not + // match the required type. + if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) + return std::pair(0U, nullptr); + + MCRegister Reg = RC->getRegister(Idx); + if (SIRegisterInfo::isVGPRClass(RC)) + RC = TRI->getVGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isSGPRClass(RC)) + RC = TRI->getSGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isAGPRClass(RC)) + RC = TRI->getAGPRClassForBitWidth(Width); + if (RC) { + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); + if (!Reg) { + // The register class does not contain the requested register, + // e.g., because it is an SGPR pair that would violate alignment + // requirements. return std::pair(0U, nullptr); - MCRegister Reg = RC->getRegister(Idx); - if (SIRegisterInfo::isVGPRClass(RC)) - RC = TRI->getVGPRClassForBitWidth(Width); - else if (SIRegisterInfo::isSGPRClass(RC)) - RC = TRI->getSGPRClassForBitWidth(Width); - else if (SIRegisterInfo::isAGPRClass(RC)) - RC = TRI->getAGPRClassForBitWidth(Width); - if (RC) { - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); - if (!Reg) { - // The register class does not contain the requested register, - // e.g., because it is an SGPR pair that would violate alignment - // requirements. - return std::pair(0U, nullptr); - } - return std::pair(Reg, RC); } + return std::pair(Reg, RC); } - } else { - // Check for lossy scalar/vector conversions. - if (VT.isVector() && VT.getSizeInBits() != 32) - return std::pair(0U, nullptr); - bool Failed = RegName.getAsInteger(10, Idx); - if (!Failed && Idx < RC->getNumRegs()) - return std::pair(RC->getRegister(Idx), RC); } + + // Check for lossy scalar/vector conversions. + if (VT.isVector() && VT.getSizeInBits() != 32) + return std::pair(0U, nullptr); + if (Idx < RC->getNumRegs()) + return std::pair(RC->getRegister(Idx), RC); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 520c321..4b48fc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an articial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. + II.eraseFromParent(); + continue; } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2aa6b4e..044a681 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { @@ -9281,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { default: if (MI.isMetaInstruction()) return 0; + + // If D16 Pseudo inst, get correct MC code size + const auto *D16Info = AMDGPU::getT16D16Helper(Opc); + if (D16Info) { + // Assume d16_lo/hi inst are always in same size + unsigned LoInstOpcode = D16Info->LoOp; + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 83b0490..a3e20ba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1313,6 +1313,10 @@ def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">; def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; +def ScaleSel : NamedIntOperand<"scale_sel"> { + let Validator = "isUInt<3>"; +} + class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; @@ -2859,6 +2863,7 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; +def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; @@ -2926,6 +2931,8 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; +def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; +def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; @@ -2941,6 +2948,13 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; +def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; +def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; +def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; +def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>; +def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>; +def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>; +def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index f1262e11..53f554e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) @@ -2564,7 +2584,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) + // We also require SCOPE_SE minimum if we not have the "cu-stores" feature. + if (Scope == CPol::SCOPE_CU && + (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); return false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 218841d..36d1a3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1218,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc..8303410 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..65fa088 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. + return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { @@ -1541,6 +1548,42 @@ bool shouldEmitConstantsToTextSection(const Triple &TT) { return TT.getArch() == Triple::r600; } +static bool isValidRegPrefix(char C) { + return C == 'v' || C == 's' || C == 'a'; +} + +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint) { + StringRef RegName = Constraint; + if (!RegName.consume_front("{") || !RegName.consume_back("}")) + return {}; + + char Kind = RegName.front(); + if (!isValidRegPrefix(Kind)) + return {}; + + RegName = RegName.drop_front(); + if (RegName.consume_front("[")) { + unsigned Idx, End; + bool Failed = RegName.consumeInteger(10, Idx); + Failed |= !RegName.consume_front(":"); + Failed |= RegName.consumeInteger(10, End); + Failed |= !RegName.consume_back("]"); + if (!Failed) { + unsigned NumRegs = End - Idx + 1; + if (NumRegs > 1) + return {Kind, Idx, NumRegs}; + } + } else { + unsigned Idx; + bool Failed = RegName.getAsInteger(10, Idx); + if (!Failed) + return {Kind, Idx, 1}; + } + + return {}; +} + std::pair<unsigned, unsigned> getIntegerPairAttribute(const Function &F, StringRef Name, std::pair<unsigned, unsigned> Default, @@ -2652,6 +2695,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: @@ -3016,6 +3060,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; default: llvm_unreachable("bad packed operand type"); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c09a9d6..1252e35 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1012,6 +1012,12 @@ bool isReadOnlySegment(const GlobalValue *GV); /// target triple \p TT, false otherwise. bool shouldEmitConstantsToTextSection(const Triple &TT); +/// Returns a valid charcode or 0 in the first entry if this is a valid physical +/// register constraint. Followed by the start register number, and the register +/// width. Does not validate the number of registers exists in the class. +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint); + /// \returns Integer value requested using \p F's \p Name attribute. /// /// \returns \p Default if attribute is not present. @@ -1636,6 +1642,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; default: diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 550ec9d..9de7d6d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] let SubtargetPredicate = HasPkFmacF16Inst in { +// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. +// If this changes, ensure the DPP variant is not used for GFX11+. defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; } // End SubtargetPredicate = HasPkFmacF16Inst @@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName, VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> : - VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>; + VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>; multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> : VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b6f9568..1ffe39d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let HasExtDPP = 0; } -let HasExt64BitDPP = 1 in { -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +let HasExt64BitDPP = 1 in { +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; + class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>; - -def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { +def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> { let HasExtVOP3DPP = 0; - let HasExtDPP = 0; + let HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>; +def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> { + let HasClamp = 1; } } // End HasExt64BitDPP = 1; @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; +defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; +defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; +defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; +} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -601,8 +625,9 @@ def shl_0_to_4 : PatFrag< }]; } -def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { - defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel); +class VOP3_CVT_PK_F8_F32_Profile<bit _HasClamp = 0> : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { + defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)), + (ins VGPR_32:$vdst_in, op_sel0:$op_sel)); let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, 0, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret, @@ -612,12 +637,13 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, false>.ret, Tail); - let HasClamp = 0; + let HasClamp = _HasClamp; let HasExtVOP3DPP = 1; } -def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> { - defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel); +class VOP3_CVT_PK_F8_F32_Profile_fake16<bit _HasClamp = 0> : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> { + defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)), + (ins VGPR_32:$vdst_in, op_sel0:$op_sel)); let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, 0, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret, @@ -627,14 +653,15 @@ def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, false>.ret, Tail); - let HasClamp = 0; + let HasClamp = _HasClamp; let HasExtVOP3DPP = 1; } // This t16 profile with vdst_in operand is for backward compatibility and is used // for user controlled packing -def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> { - defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel); +class VOP3_CVT_PK_F8_F32_Profile_t16<bit _HasClamp = 0> : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> { + defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)), + (ins VGPR_16:$vdst_in, op_sel0:$op_sel)); let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, 0, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret, @@ -644,7 +671,7 @@ def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_O HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, false>.ret, Tail); - let HasClamp = 0; + let HasClamp = _HasClamp; let HasExtVOP3DPP = 1; } @@ -678,10 +705,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>, HasModifiers, DstVT>.ret); } -class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> : +class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT, bit _HasClamp = 0> : VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> { let HasFP8DstByteSel = 1; - let HasClamp = 0; + let HasClamp = _HasClamp; } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -722,6 +749,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 +let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; +} + defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; @@ -749,15 +783,23 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>; let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { - defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile, - VOP3_CVT_PK_F8_F32_Profile_t16, - VOP3_CVT_PK_F8_F32_Profile_fake16>; - defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile, - VOP3_CVT_PK_F8_F32_Profile_t16, - VOP3_CVT_PK_F8_F32_Profile_fake16>; + let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in + defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>, + VOP3_CVT_PK_F8_F32_Profile_t16<>, + VOP3_CVT_PK_F8_F32_Profile_fake16<>>; + let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in + defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32_gfx1250", VOP3_CVT_PK_F8_F32_Profile<true>, + VOP3_CVT_PK_F8_F32_Profile_t16<true>, + VOP3_CVT_PK_F8_F32_Profile_fake16<true>>; + defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile<>, + VOP3_CVT_PK_F8_F32_Profile_t16<>, + VOP3_CVT_PK_F8_F32_Profile_fake16<>>; let SubtargetPredicate = isGFX12Plus in { - defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; + let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in + defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; + let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in + defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>; defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; } } @@ -776,6 +818,11 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; +class Cvt_PK_F8_F32_E5M3_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst, int Clamp> : GCNPat< + (i32 (node f32:$src0, f32:$src1, i32:$old, index)), + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, Clamp, $old, 0) +>; + multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> { def : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, -1)), @@ -791,6 +838,21 @@ def : GCNPat< >; } +multiclass Cvt_PK_F8_F32_E5M3_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst, int Clamp> { +def : GCNPat< + (i32 (node f32:$src0, f32:$src1, i32:$old, -1)), + (REG_SEQUENCE VGPR_32, + (i16 (EXTRACT_SUBREG $old, lo16)), lo16, + (i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16) +>; +def : GCNPat< + (i32 (node f32:$src0, f32:$src1, i32:$old, 0)), + (REG_SEQUENCE VGPR_32, + (i16 (inst 0, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16, + (i16 (EXTRACT_SUBREG $old, hi16)), hi16) +>; +} + class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, @@ -803,21 +865,37 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel)) >; +class Cvt_SR_F8_ByteSel_E5M3_Pat<SDPatternOperator node, VOP3_Pseudo inst, + ValueType SrcVT, int Clamp> : GCNPat< + (i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers), + i32:$old, timm:$byte_sel)), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, Clamp, $old, (as_i32timm $byte_sel)) +>; + let OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { let True16Predicate = NotHasTrue16BitInsts in { - def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>; + let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in + def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>; def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>; } let True16Predicate = UseFakeTrue16Insts in { def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>; def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>; + let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in { + def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.NONE>; + def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.ENABLE>; + } } } let True16Predicate = UseRealTrue16Insts in { defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>; defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>; + let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in { + defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.NONE>; + defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.ENABLE>; + } } let SubtargetPredicate = isGFX940Plus in { @@ -828,7 +906,12 @@ let SubtargetPredicate = isGFX940Plus in { } let SubtargetPredicate = isGFX12Plus in { - def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>; + let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>; + let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in { + def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.NONE>; + def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>; + } def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>; } } @@ -848,6 +931,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>; + def : GCNPat< (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -858,6 +944,13 @@ def : GCNPat< (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +let SubtargetPredicate = HasAddMinMaxInsts in { +def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +} + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -972,10 +1065,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim unsigned Val = N->getZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); - } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + } else if (}] # modifier_idx # [{== 1) { + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + } if (}] # modifier_idx # [{== 2) { + New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1427,34 +1522,72 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = HasBitOp3Insts in { +let HasClamp = 0, HasModifiers = 1 in { +def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; +def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; +def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>; +} + +let OtherPredicates = [HasBitOp3Insts] in { let isReMaterializable = 1 in { - defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>; + let SubtargetPredicate = isGFX940Plus in + defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>; + let SubtargetPredicate = isGFX1250Plus in + defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile, + BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>, VOPD_Component<0x12, "v_bitop2_b32">; } + def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; def : GCNPat< - (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; - - def : GCNPat< (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; - def : GCNPat< - (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; -} // End SubtargetPredicate = HasBitOp3Insts + let SubtargetPredicate = isGFX940Plus in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } // End SubtargetPredicate = isGFX940Plus + + let SubtargetPredicate = isGFX1250Plus in { + let True16Predicate = UseFakeTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } + let True16Predicate = UseRealTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + } + } // End SubtargetPredicate = isGFX1250Plus + +} // End OtherPredicates = [HasBitOp3Insts] class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), @@ -1531,6 +1664,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; let SubtargetPredicate = HasBF16ConversionInsts in { let ReadsModeReg = 0 in { defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>; + defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>; } def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)), (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; @@ -1541,6 +1675,85 @@ let SubtargetPredicate = HasBF16ConversionInsts in { (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; } +class VOP3_CVT_SCALE_PK_F16_F864_Profile<VOPProfile P> : VOP3_CVT_SCALEF32_PK_F864_Profile<P> { + let Src0RC64 = getVOP3VRegSrcForVT<Src0VT>.ret; + let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasClamp, HasModifiers, HasSrc2Mods, + HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret, + (ins ScaleSel:$scale_sel)); + let Asm64 = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp, + HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods, + HasSrc2Mods, DstVT>.ret # "$scale_sel"; +} + +multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator node> { + def _e64 : VOP3InstBase<OpName, VOP3_CVT_SCALE_PK_F16_F864_Profile<P>> { + let Pattern = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0)), i32:$src1, i32:$scale_sel))]; + } +} + +let Src0RC64 = VSrc_NoInline_v2f16 in { +def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; +def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; +def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>; +} + +let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in { + defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_fp8_f16>; + defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_bf8_f16>; +} + +let HasClamp = 0, HasOpSel = 1 in { +def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>; +def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>; +def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>; +} + +let SubtargetPredicate = isGFX1250Plus in { + let ReadsModeReg = 0 in { + defm V_CVT_SR_PK_F16_F32 : VOP3Inst<"v_cvt_sr_pk_f16_f32", VOP3_Profile<VOP_V2F16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_f16_f32>; + + // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel + // to select a byte in the vdst. Bits 0 and 1 are unused. + let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + } + + let Constraints = "@earlyclobber $vdst" in { + defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>; + defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>; + defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>; + defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; + defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; + defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + } // End Constraints = "@earlyclobber $vdst" + + defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; + defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>; + defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; + } // End ReadsModeReg = 0 + + let True16Predicate = UseRealTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; + } + let True16Predicate = UseFakeTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>; + } +} // End SubtargetPredicate = isGFX1250Plus + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -1746,10 +1959,20 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; -defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; -defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; -defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; -defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; + +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; +defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; +defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; +defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>; +defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; +defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; +defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; +defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; +defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; //===----------------------------------------------------------------------===// // GFX11, GFX12 @@ -1911,6 +2134,13 @@ defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36 defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x363, "v_or_b16">; defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x364, "v_xor_b16">; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<0x369, "v_cvt_pk_fp8_f32">; +defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x369, "v_cvt_pk_fp8_f32">; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36a, "v_cvt_pk_bf8_f32">; +defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">; +defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Only_Realtriple_with_name_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx1250", "v_cvt_sr_fp8_f32">; +defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">; + let AssemblerPredicate = isGFX11Plus in { def : AMDGPUMnemonicAlias<"v_add3_nc_u32", "v_add3_u32">; def : AMDGPUMnemonicAlias<"v_xor_add_u32", "v_xad_u32">; @@ -1918,7 +2148,25 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; +defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; +defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x29f>; +defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a0>; +defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a1>; +defm V_CVT_SCALE_PK8_F16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a8>; +defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a9>; +defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; +defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; +defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; +defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; +defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; +defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; +defm V_CVT_SR_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x370>; +defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">; +defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">; +defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>; +defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c21e2d3..f027ab0 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -401,6 +401,26 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{49-41} = src0; } +class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); +} + +class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<3> scale_sel; + + let Inst{13-11} = scale_sel; + let Inst{14} = 0; +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; @@ -1506,6 +1526,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1525,6 +1546,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1540,6 +1562,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1723,6 +1746,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> let Inst{14 - 8} = sdst; } +class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName> + : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + +class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName> + : Base_VOP3_DPP8_t16<op, p, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : Base_VOP3_DPP8<op, ps, opName> { bits<8> sdst; @@ -1943,6 +1994,53 @@ multiclass VOP3be_Realtriple< multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> : VOP3be_Realtriple<Gen, op, 1>; +multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> { + def _e64_dpp#Gen.Suffix : + VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>; +} + +multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> { + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let AssemblerPredicate = Gen.AssemblerPredicate; + } +} + +multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in { + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3a_BITOP3_gfx12<op, ps.Pfl>; + } +} + +multiclass VOP3Only_ScaleSel_Real_gfx1250<bits<10> op> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + def _e64_gfx1250 : + VOP3_Real_Gen<ps, GFX1250Gen>, + VOP3a_ScaleSel_gfx1250<op, ps.Pfl>; +} + +multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, + string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic, 1>; + defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>; +} + +multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + //===----------------------------------------------------------------------===// // VOP3 GFX11 //===----------------------------------------------------------------------===// @@ -2004,6 +2102,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> : multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> : VOP3_Realtriple<GFX1250Gen, op, isSingle>; +multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + +multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>; + multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; @@ -2024,6 +2131,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>; } +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op, + string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>; + defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>; +} + multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); @@ -2046,6 +2160,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>, VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>; +multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>; + +multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> { + defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>; + defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>; +} + multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0, string opName = NAME> : VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 066b392..bd4b75f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2423,6 +2423,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2446,6 +2447,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !Subtarget->noBTIAtReturnTwice()) GuardWithBTI = AFI->branchTargetEnforcement(); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Determine whether this is a non-secure function call. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) isCmseNSCall = true; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 146fc67..dfa3de3c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1125,7 +1125,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, const unsigned NumBytes = getFixupKindNumBytes(Kind); unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 868556b..6dfe846 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1284,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } // Add the R_ARM_NONE fixup at the same position void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name); + visitUsedSymbol(*PersonalitySym); const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); - - visitUsedExpr(*PersonalityRef); - MCFragment *DF = getCurrentFragment(); - DF->addFixup( - MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); + addFixup(PersonalityRef, FK_Data_4); } void ARMELFStreamer::FlushPendingOffset() { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 128cc0b..38444f9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -398,7 +398,7 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a87b9a2..bed6bc9 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { return; } - // MapDef type may be a struct type or a non-pointer derived type - const DIType *OrigTy = Ty; - while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { - auto Tag = DTy->getTag(); - if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && - Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type) - break; - Ty = DTy->getBaseType(); - } - - const auto *CTy = dyn_cast<DICompositeType>(Ty); - if (!CTy) - return; - - auto Tag = CTy->getTag(); - if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) - return; - - // Visit all struct members to ensure their types are visited. - const DINodeArray Elements = CTy->getElements(); - for (const auto *Element : Elements) { - const auto *MemberType = cast<DIDerivedType>(Element); - const DIType *MemberBaseType = MemberType->getBaseType(); - - // If the member is a composite type, that may indicate the currently - // visited composite type is a wrapper, and the member represents the - // actual map definition. - // In that case, visit the member with `visitMapDefType` instead of - // `visitTypeEntry`, treating it specifically as a map definition rather - // than as a regular composite type. - const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); - if (MemberCTy) { - visitMapDefType(MemberBaseType, TypeId); - } else { - visitTypeEntry(MemberBaseType); + uint32_t TmpId; + switch (Ty->getTag()) { + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_volatile_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_pointer_type: + visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_array_type: + // Visit nested map array and jump to the element type + visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_structure_type: { + // Visit all struct members to ensure their types are visited. + const auto *CTy = cast<DICompositeType>(Ty); + const DINodeArray Elements = CTy->getElements(); + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + const DIType *MemberBaseType = MemberType->getBaseType(); + // If the member is a composite type, that may indicate the currently + // visited composite type is a wrapper, and the member represents the + // actual map definition. + // In that case, visit the member with `visitMapDefType` instead of + // `visitTypeEntry`, treating it specifically as a map definition rather + // than as a regular composite type. + const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); + if (MemberCTy) { + visitMapDefType(MemberBaseType, TmpId); + } else { + visitTypeEntry(MemberBaseType); + } } + break; + } + default: + break; } // Visit this type, struct or a const/typedef/volatile/restrict type - visitTypeEntry(OrigTy, TypeId, false, false); + visitTypeEntry(Ty, TypeId, false, false); } /// Read file contents from the actual file or from the source diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index 694d9ea..1bd82fad 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -220,7 +220,7 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index ebdfcaa..a4f5086 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -17,7 +17,6 @@ #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Frontend/HLSL/RootSignatureMetadata.h" -#include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" @@ -111,14 +110,25 @@ analyzeModule(Module &M) { reportError(Ctx, "Root Element is not a metadata node."); continue; } - mcdxbc::RootSignatureDesc RSD; - if (std::optional<uint32_t> Version = extractMdIntValue(RSDefNode, 2)) - RSD.Version = *Version; - else { + std::optional<uint32_t> V = extractMdIntValue(RSDefNode, 2); + if (!V.has_value()) { reportError(Ctx, "Invalid RSDefNode value, expected constant int"); continue; } + llvm::hlsl::rootsig::MetadataParser MDParser(RootElementListNode); + llvm::Expected<mcdxbc::RootSignatureDesc> RSDOrErr = + MDParser.ParseRootSignature(V.value()); + + if (!RSDOrErr) { + handleAllErrors(RSDOrErr.takeError(), [&](ErrorInfoBase &EIB) { + Ctx->emitError(EIB.message()); + }); + continue; + } + + auto &RSD = *RSDOrErr; + // Clang emits the root signature data in dxcontainer following a specific // sequence. First the header, then the root parameters. So the header // offset will always equal to the header size. @@ -127,12 +137,6 @@ analyzeModule(Module &M) { // static sampler offset is calculated when writting dxcontainer. RSD.StaticSamplersOffset = 0u; - hlsl::rootsig::MetadataParser MDParser(RootElementListNode); - - if (MDParser.ParseRootSignature(Ctx, RSD)) { - return RSDMap; - } - RSDMap.insert(std::make_pair(F, RSD)); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7d3074b..d5b7a75 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -669,7 +669,7 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // to a real offset before we can use it. uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); char *InstAddr = Data.data() + Offset; Value = adjustFixupValue(Kind, FixupValue); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index d96136c..a5bf0e5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2621,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - if (isa<ConstantSDNode>(Op->getOperand(2))) + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = EltVT.getScalarSizeInBits(); + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + + if (isa<ConstantSDNode>(Op2)) return Op; - return SDValue(); + + MVT IdxTy = MVT::getIntegerVT(EltSizeInBits); + MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts); + + if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy)) + return SDValue(); + + SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1); + SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2); + + SmallVector<SDValue, 32> RawIndices; + for (unsigned i = 0; i < NumElts; ++i) + RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices); + + // insert vec, elt, idx + // => + // select (splatidx == {0,1,2...}) ? splatelt : vec + SDValue SelectCC = + DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ); + return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0); } SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index d9ea88c..858f3d0 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -169,7 +169,7 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) { diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index 5e03903..7ef705d 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -85,7 +85,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Asm->getWriter().recordRelocation(F, Fixup, Target, Value); unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); // Check that uppper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is // limited to the lower bits. This is to remain compatible with diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index 29e5bfa..b513503 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -120,7 +120,7 @@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index ec6b382..881ba8e 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3341,6 +3341,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3397,8 +3398,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); - // Call site info for function parameters tracking. + // Call site info for function parameters tracking and call base type info. MachineFunction::CallSiteInfo CSInfo; + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Check if it's really possible to do a tail call. Restrict it to functions // that are part of this compilation unit. diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 8eec915..ee1ca45 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -391,16 +391,6 @@ void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum, } } -void NVPTXInstPrinter::printOffseti32imm(const MCInst *MI, int OpNum, - raw_ostream &O) { - auto &Op = MI->getOperand(OpNum); - assert(Op.isImm() && "Invalid operand"); - if (Op.getImm() != 0) { - O << "+"; - printOperand(MI, OpNum, O); - } -} - void NVPTXInstPrinter::printHexu32imm(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Imm = MI->getOperand(OpNum).getImm(); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index c3ff346..92155b0 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -46,7 +46,6 @@ public: StringRef Modifier = {}); void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); - void printOffseti32imm(const MCInst *MI, int OpNum, raw_ostream &O); void printHexu32imm(const MCInst *MI, int OpNum, raw_ostream &O); void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O); void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O); diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp index cd40481..a349609 100644 --- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp @@ -56,15 +56,12 @@ static bool traverseMoveUse(MachineInstr &U, const MachineRegisterInfo &MRI, case NVPTX::LD_i16: case NVPTX::LD_i32: case NVPTX::LD_i64: - case NVPTX::LD_i8: case NVPTX::LDV_i16_v2: case NVPTX::LDV_i16_v4: case NVPTX::LDV_i32_v2: case NVPTX::LDV_i32_v4: case NVPTX::LDV_i64_v2: - case NVPTX::LDV_i64_v4: - case NVPTX::LDV_i8_v2: - case NVPTX::LDV_i8_v4: { + case NVPTX::LDV_i64_v4: { LoadInsts.push_back(&U); return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 65e7c56..6068035 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm) { - doMulWide = (OptLevel > CodeGenOptLevel::None); -} + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); @@ -145,18 +143,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStoreVector(N)) return; break; - case NVPTXISD::LoadParam: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadParamV4: - if (tryLoadParam(N)) - return; - break; - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - if (tryStoreParam(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -1017,14 +1003,10 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { // Helper function template to reduce amount of boilerplate code for // opcode selection. static std::optional<unsigned> -pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8, - std::optional<unsigned> Opcode_i16, +pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16, std::optional<unsigned> Opcode_i32, std::optional<unsigned> Opcode_i64) { switch (VT) { - case MVT::i1: - case MVT::i8: - return Opcode_i8; case MVT::f16: case MVT::i16: case MVT::bf16: @@ -1092,8 +1074,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { Chain}; const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; - const std::optional<unsigned> Opcode = pickOpcodeForVT( - TargetVT, NVPTX::LD_i8, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64); + const std::optional<unsigned> Opcode = + pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64); if (!Opcode) return false; @@ -1178,17 +1160,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { default: llvm_unreachable("Unexpected opcode"); case NVPTXISD::LoadV2: - Opcode = - pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2, - NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2); + Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2, + NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2); break; case NVPTXISD::LoadV4: - Opcode = - pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4, - NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4); + Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4, + NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */}, + Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */}); break; } @@ -1244,22 +1224,21 @@ bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) { default: llvm_unreachable("Unexpected opcode"); case ISD::LOAD: - Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8, - NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32, - NVPTX::LD_GLOBAL_NC_i64); + Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16, + NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64); break; case NVPTXISD::LoadV2: - Opcode = pickOpcodeForVT( - TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16, - NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64); + Opcode = + pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16, + NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64); break; case NVPTXISD::LoadV4: - Opcode = pickOpcodeForVT( - TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16, - NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64); + Opcode = + pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16, + NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */}, + Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */}, NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */}); break; } @@ -1290,8 +1269,9 @@ bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) { break; } - const MVT::SimpleValueType SelectVT = - MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy; + SDLoc DL(N); + const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits() / NumElts; + const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; // If this is an LDU intrinsic, the address is the third operand. If its an // LDU SD node (from custom vector handling), then its the second operand @@ -1300,32 +1280,28 @@ bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) { SDValue Base, Offset; SelectADDR(Addr, Base, Offset); - SDValue Ops[] = {Base, Offset, LD->getChain()}; + SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()}; std::optional<unsigned> Opcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case ISD::INTRINSIC_W_CHAIN: - Opcode = - pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16, - NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64); + Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16, + NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64); break; case NVPTXISD::LDUV2: - Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8, - NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32, - NVPTX::LDU_GLOBAL_v2i64); + Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16, + NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64); break; case NVPTXISD::LDUV4: - Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8, - NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32, - {/* no v4i64 */}); + Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16, + NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */}); break; } if (!Opcode) return false; - SDLoc DL(N); SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops); ReplaceNode(LD, NVPTXLDU); @@ -1376,8 +1352,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { Chain}; const std::optional<unsigned> Opcode = - pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i8, - NVPTX::ST_i16, NVPTX::ST_i32, NVPTX::ST_i64); + pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16, + NVPTX::ST_i32, NVPTX::ST_i64); if (!Opcode) return false; @@ -1437,16 +1413,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { default: return false; case NVPTXISD::StoreV2: - Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2, - NVPTX::STV_i32_v2, NVPTX::STV_i64_v2); + Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2, + NVPTX::STV_i64_v2); break; case NVPTXISD::StoreV4: - Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4, - NVPTX::STV_i32_v4, NVPTX::STV_i64_v4); + Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, + NVPTX::STV_i64_v4); break; case NVPTXISD::StoreV8: - Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */}, - NVPTX::STV_i32_v8, {/* no v8i64 */}); + Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8, + {/* no v8i64 */}); break; } @@ -1462,267 +1438,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return true; } -bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Offset = Node->getOperand(2); - SDValue Glue = Node->getOperand(3); - SDLoc DL(Node); - MemSDNode *Mem = cast<MemSDNode>(Node); - - unsigned VecSize; - switch (Node->getOpcode()) { - default: - return false; - case NVPTXISD::LoadParam: - VecSize = 1; - break; - case NVPTXISD::LoadParamV2: - VecSize = 2; - break; - case NVPTXISD::LoadParamV4: - VecSize = 4; - break; - } - - EVT EltVT = Node->getValueType(0); - EVT MemVT = Mem->getMemoryVT(); - - std::optional<unsigned> Opcode; - - switch (VecSize) { - default: - return false; - case 1: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, - NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64); - break; - case 2: - Opcode = - pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, - NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, - NVPTX::LoadParamMemV2I64); - break; - case 4: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/* no v4i64 */}); - break; - } - if (!Opcode) - return false; - - SDVTList VTs; - if (VecSize == 1) { - VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); - } else if (VecSize == 2) { - VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); - } else { - EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(EVTs); - } - - unsigned OffsetVal = Offset->getAsZExtVal(); - - SmallVector<SDValue, 2> Ops( - {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); - return true; -} - -// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri) -#define getOpcV2H(ty, opKind0, opKind1) \ - NVPTX::StoreParamV2##ty##_##opKind0##opKind1 - -#define getOpcV2H1(ty, opKind0, isImm1) \ - (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) - -#define getOpcodeForVectorStParamV2(ty, isimm) \ - (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) - -#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ - NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 - -#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ - (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ - : getOpcV4H(ty, opKind0, opKind1, opKind2, r) - -#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ - (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ - : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) - -#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ - (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ - : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) - -#define getOpcodeForVectorStParamV4(ty, isimm) \ - (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ - : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) - -#define getOpcodeForVectorStParam(n, ty, isimm) \ - (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ - : getOpcodeForVectorStParamV4(ty, isimm) - -static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, - unsigned NumElts, - MVT::SimpleValueType MemTy, - SelectionDAG *CurDAG, SDLoc DL) { - // Determine which inputs are registers and immediates make new operators - // with constant values - SmallVector<bool, 4> IsImm(NumElts, false); - for (unsigned i = 0; i < NumElts; i++) { - IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i])); - if (IsImm[i]) { - SDValue Imm = Ops[i]; - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[i] = Imm; - } - } - - // Get opcode for MemTy, size, and register/immediate operand ordering - switch (MemTy) { - case MVT::i8: - return getOpcodeForVectorStParam(NumElts, I8, IsImm); - case MVT::i16: - return getOpcodeForVectorStParam(NumElts, I16, IsImm); - case MVT::i32: - return getOpcodeForVectorStParam(NumElts, I32, IsImm); - case MVT::i64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(I64, IsImm); - case MVT::f32: - return getOpcodeForVectorStParam(NumElts, F32, IsImm); - case MVT::f64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(F64, IsImm); - - // These cases don't support immediates, just use the all register version - // and generate moves. - case MVT::i1: - return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr - : NVPTX::StoreParamV4I8_rrrr; - case MVT::f16: - case MVT::bf16: - return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr - : NVPTX::StoreParamV4I16_rrrr; - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v2i16: - case MVT::v4i8: - return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr - : NVPTX::StoreParamV4I32_rrrr; - default: - llvm_unreachable("Cannot select st.param for unknown MemTy"); - } -} - -bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Param = N->getOperand(1); - unsigned ParamVal = Param->getAsZExtVal(); - SDValue Offset = N->getOperand(2); - unsigned OffsetVal = Offset->getAsZExtVal(); - MemSDNode *Mem = cast<MemSDNode>(N); - SDValue Glue = N->getOperand(N->getNumOperands() - 1); - - // How many elements do we have? - unsigned NumElts; - switch (N->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParam: - NumElts = 1; - break; - case NVPTXISD::StoreParamV2: - NumElts = 2; - break; - case NVPTXISD::StoreParamV4: - NumElts = 4; - break; - } - - // Build vector of operands - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(N->getOperand(i + 3)); - Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - // Determine target opcode - // If we have an i1, use an 8-bit store. The lowering code in - // NVPTXISelLowering will have already emitted an upcast. - std::optional<unsigned> Opcode; - switch (NumElts) { - default: - llvm_unreachable("Unexpected NumElts"); - case 1: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - SDValue Imm = Ops[0]; - if (MemTy != MVT::f16 && MemTy != MVT::bf16 && - (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) { - // Convert immediate to target constant - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[0] = Imm; - // Use immediate version of store param - Opcode = - pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i, - NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i); - } else - Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, - NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r, - NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r); - if (Opcode == NVPTX::StoreParamI8_r) { - // Fine tune the opcode depending on the size of the operand. - // This helps to avoid creating redundant COPY instructions in - // InstrEmitter::AddRegisterOperand(). - switch (Ops[0].getSimpleValueType().SimpleTy) { - default: - break; - case MVT::i32: - Opcode = NVPTX::StoreParamI8TruncI32_r; - break; - case MVT::i64: - Opcode = NVPTX::StoreParamI8TruncI64_r; - break; - } - } - break; - } - case 2: - case 4: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); - break; - } - } - - SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); - MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); - - ReplaceNode(N, Ret); - return true; -} - /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { @@ -1962,10 +1677,11 @@ bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) { auto API = APF.bitcastToAPInt(); API = API.concat(API); auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32); - return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32i, DL, VT, Const), 0); + return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const), + 0); } auto Const = CurDAG->getTargetConstantFP(APF, DL, VT); - return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16i, DL, VT, Const), 0); + return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0); }; switch (N->getOpcode()) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b99b4ef..9e0f88e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -40,9 +40,6 @@ private: class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; - // If true, generate mul.wide from sext and mul - bool doMulWide; - NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const; bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; @@ -78,8 +75,6 @@ private: bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); - bool tryLoadParam(SDNode *N); - bool tryStoreParam(SDNode *N); bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ddcecc00..65d1be3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE}); + ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::CALL) - MAKE_CASE(NVPTXISD::LoadParam) - MAKE_CASE(NVPTXISD::LoadParamV2) - MAKE_CASE(NVPTXISD::LoadParamV4) - MAKE_CASE(NVPTXISD::StoreParam) - MAKE_CASE(NVPTXISD::StoreParamV2) - MAKE_CASE(NVPTXISD::StoreParamV4) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) @@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } -static bool adjustElementType(EVT &ElementType) { - switch (ElementType.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::f16: - case MVT::bf16: - ElementType = MVT::i16; - return true; - case MVT::f32: - case MVT::v2f16: - case MVT::v2bf16: - ElementType = MVT::i32; - return true; - case MVT::f64: - ElementType = MVT::i64; - return true; - } -} - -// Use byte-store when the param address of the argument value is unaligned. -// This may happen when the return value is a field of a packed structure. -// -// This is called in LowerCall() when passing the param values. -static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, - uint64_t Offset, EVT ElementType, - SDValue StVal, SDValue &InGlue, - unsigned ArgID, const SDLoc &dl) { - // Bit logic only works on integer types - if (adjustElementType(ElementType)) - StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); - - // Store each byte - SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - // Shift the byte to the last byte position - SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, - DAG.getConstant(i * 8, dl, MVT::i32)); - SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - ShiftVal, InGlue}; - // Trunc store only the last byte by using - // st.param.b8 - // The register type can be larger than b8. - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, - MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); - } - return Chain; -} - -// Use byte-load when the param adress of the returned value is unaligned. -// This may happen when the returned value is a field of a packed structure. -static SDValue -LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, - EVT ElementType, SDValue &InGlue, - SmallVectorImpl<SDValue> &TempProxyRegOps, - const SDLoc &dl) { - // Bit logic only works on integer types - EVT MergedType = ElementType; - adjustElementType(MergedType); - - // Load each byte and construct the whole value. Initial value to 0 - SDValue RetVal = DAG.getConstant(0, dl, MergedType); - // LoadParamMemI8 loads into i16 register only - SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - InGlue}; - // This will be selected to LoadParamMemI8 - SDValue LdVal = - DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, - MVT::i8, MachinePointerInfo(), Align(1)); - SDValue TmpLdVal = LdVal.getValue(0); - Chain = LdVal.getValue(1); - InGlue = LdVal.getValue(2); - - TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, - TmpLdVal.getSimpleValueType(), TmpLdVal); - TempProxyRegOps.push_back(TmpLdVal); - - SDValue CMask = DAG.getConstant(255, dl, MergedType); - SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); - // Need to extend the i16 register to the whole width. - TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); - // Mask off the high bits. Leave only the lower 8bits. - // Do this because we are using loadparam.b8. - TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); - // Shift and merge - TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); - RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); - } - if (ElementType != MergedType) - RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); - - return RetVal; -} - static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func) { if (!Func) @@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc dl = CLI.DL; - SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; + const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; @@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return DAG.getConstant(I, dl, MVT::i32); }; + const unsigned UniqueCallSite = GlobalUniqueCallSite++; + const SDValue CallChain = CLI.Chain; + const SDValue StartChain = + DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl); + SDValue DeclareGlue = StartChain.getValue(1); + + SmallVector<SDValue, 16> CallPrereqs{StartChain}; + + const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) { + // PTX ABI requires integral types to be at least 32 bits in size. FP16 is + // loaded/stored using i16, so it's handled here as well. + const unsigned SizeBits = promoteScalarArgumentSize(Size * 8); + SDValue Declare = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(SizeBits), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + + const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align, + unsigned Size) { + SDValue Declare = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + // Variadic arguments. // // Normally, for each argument, we declare a param scalar or a param @@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // After all vararg is processed, 'VAOffset' holds the size of the // vararg byte array. + assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) && + "Non-VarArg function with extra arguments"); - SDValue VADeclareParam; // vararg byte array const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic - unsigned VAOffset = 0; // current offset in the param array + unsigned VAOffset = 0; // current offset in the param array - const unsigned UniqueCallSite = GlobalUniqueCallSite++; - SDValue TempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); - SDValue InGlue = Chain.getValue(1); + const SDValue VADeclareParam = + CLI.Args.size() > FirstVAArg + ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32), + Align(STI.getMaxRequiredAlignment()), 0) + : SDValue(); // Args.size() and Outs.size() need not match. // Outs.size() will be larger @@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = DAG.getNode( - NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), - GetI32(0), InGlue}); - return VADeclareParam; - } - return std::nullopt; - } - if (IsByVal || shouldPassAsArray(Arg.Ty)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(ArgAlign.value()), - GetI32(TypeSize), InGlue}); - } + const SDValue ArgDeclare = [&]() { + if (IsVAArg) + return VADeclareParam; + + if (IsByVal || shouldPassAsArray(Arg.Ty)) + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); - // declare .param .b<size> .param<n>; - - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - const unsigned PromotedSize = - (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) - ? promoteScalarArgumentSize(TypeSize * 8) - : TypeSize * 8; - - return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && + "Only int and float types are supported as non-array arguments"); + + return MakeDeclareScalarParam(ParamSymbol, TypeSize); }(); - if (ArgDeclare) { - Chain = ArgDeclare->getValue(0); - InGlue = ArgDeclare->getValue(1); - } // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const Align PartAlign) { - SDValue StVal; + const MaybeAlign PartAlign) { if (IsByVal) { SDValue Ptr = ArgOutVals[0]; auto MPI = refinePtrAS(Ptr, DAG, DL, *this); SDValue SrcAddr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign); - } else { - StVal = ArgOutVals[I]; - - auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); - if (PromotedVT != StVal.getValueType()) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, - StVal); - } + return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); } + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = - DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - return StVal; + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); }; const auto VectorInfo = @@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar store. In such cases, fall back to byte stores. - if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { - - SDValue StVal = GetStoredValue(J, EltVT, PartAlign); - Chain = LowerUnalignedStoreParam(DAG, Chain, - CurOffset + (IsByVal ? VAOffset : 0), - EltVT, StVal, InGlue, ArgI, dl); - - // LowerUnalignedStoreParam took care of inserting the necessary nodes - // into the SDAG, so just move on to the next element. - J++; - continue; - } + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); if (IsVAArg && !IsByVal) // Align each part of the variadic argument to their type. @@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((IsVAArg || VAOffset == 0) && "VAOffset must be 0 for non-VA args"); - SmallVector<SDValue, 6> StoreOperands{ - Chain, GetI32(IsVAArg ? FirstVAArg : ArgI), - GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; - // Record the values to store. - for (const unsigned K : llvm::seq(NumElts)) - StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign)); - StoreOperands.push_back(InGlue); + const unsigned Offset = + (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); + + SDValue Val; + if (NumElts == 1) { + Val = GetStoredValue(J, EltVT, CurrentAlign); + } else { + SmallVector<SDValue, 8> StoreVals; + for (const unsigned K : llvm::seq(NumElts)) { + SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); + if (ValJ.getValueType().isVector()) + DAG.ExtractVectorElements(ValJ, StoreVals); + else + StoreVals.push_back(ValJ); + } + + EVT VT = EVT::getVectorVT( + *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); + Val = DAG.getBuildVector(VT, dl, StoreVals); } - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); // TODO: We may need to support vector types that can be passed // as scalars in variadic arguments. if (IsVAArg && !IsByVal) { assert(NumElts == 1 && "Vectorization is expected to be disabled for variadics."); + const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); } @@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VAOffset += TypeSize; } - GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); - // Handle Result if (!Ins.empty()) { - const SDValue RetDeclare = [&]() { - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (shouldPassAsArray(RetTy)) { - const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(RetAlign.value()), - GetI32(ResultSize / 8), InGlue}); - } - const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); - return DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); - }(); - Chain = RetDeclare.getValue(0); - InGlue = RetDeclare.getValue(1); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSize(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize); + } else { + MakeDeclareScalarParam(RetSymbol, ResultSize); + } } - const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); // Set the size of the vararg param byte array if the callee is a variadic // function and the variadic part is not empty. - if (HasVAArgs) { + if (VADeclareParam) { SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), VADeclareParam.getOperand(2), GetI32(VAOffset), @@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VADeclareParam->getVTList(), DeclareParamOps); } + const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); // If the type of the callsite does not match that of the function, convert // the callsite to an indirect call. const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); @@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // instruction. // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. + const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); std::string Proto = getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - Chain = DAG.getNode( - NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); - InGlue = Chain.getValue(1); + const SDValue PrototypeDeclare = DAG.getNode( + NVPTXISD::CallPrototype, dl, MVT::Other, + {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)}); + CallPrereqs.push_back(PrototypeDeclare); } if (ConvertToIndirectCall) { @@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) - Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), - GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, - GetI32(Proto), InGlue}); - InGlue = Chain.getValue(1); - + /// NumParams, Callee, Proto) + const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs); + const SDValue Call = DAG.getNode( + NVPTXISD::CALL, dl, MVT::Other, + {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)}); + + SmallVector<SDValue, 16> LoadChains{Call}; SmallVector<SDValue, 16> ProxyRegOps; - // An item of the vector is filled if the element does not need a ProxyReg - // operation on it and should be added to InVals as is. ProxyRegOps and - // ProxyRegTruncates contain empty/none items at the same index. - SmallVector<SDValue, 16> RetElts; - // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` - // to use the values of `LoadParam`s and to be replaced later then - // `CALLSEQ_END` is added. - SmallVector<SDValue, 16> TempProxyRegOps; - - // Generate loads from param memory/moves from registers for result if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; @@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); - EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - - if (TheLoadType != VTs[I]) - EltType = TheLoadType; - - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - } else if (TheLoadType.getSizeInBits() < 16) { - EltType = MVT::i16; - } + for (const unsigned NumElts : VectorInfo) { + const MaybeAlign CurrentAlign = + ExtendIntegerRetVal ? MaybeAlign(std::nullopt) + : commonAlignment(RetAlign, Offsets[I]); - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar load. In such cases, fall back to byte loads. - if (VectorizedSize == 1 && RetTy->isAggregateType() && - EltAlign < DAG.getEVTAlign(TheLoadType)) { - SDValue Ret = LowerUnalignedLoadRetParam( - DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl); - ProxyRegOps.push_back(SDValue()); - RetElts.resize(I); - RetElts.push_back(Ret); - - I++; - continue; - } + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT LoadVT = + ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); - LoadVTs.append({MVT::Other, MVT::Glue}); + const unsigned PackingAmt = + LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - NVPTXISD::NodeType Op; - switch (VectorizedSize) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + const EVT VecVT = NumElts == 1 ? LoadVT + : EVT::getVectorVT(*DAG.getContext(), + LoadVT.getScalarType(), + NumElts * PackingAmt); - SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); - for (const unsigned J : llvm::seq(VectorizedSize)) { - ProxyRegOps.push_back(RetVal.getValue(J)); - } + SDValue R = + DAG.getLoad(VecVT, dl, Call, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - Chain = RetVal.getValue(VectorizedSize); - InGlue = RetVal.getValue(VectorizedSize + 1); + LoadChains.push_back(R.getValue(1)); - I += VectorizedSize; + if (NumElts == 1) + ProxyRegOps.push_back(R); + else + for (const unsigned J : llvm::seq(NumElts)) { + SDValue Elt = DAG.getNode( + LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); + ProxyRegOps.push_back(Elt); + } + I += NumElts; } } - Chain = - DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); - InGlue = Chain.getValue(1); + const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains); + const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite, + UniqueCallSite + 1, SDValue(), dl); // Append ProxyReg instructions to the chain to make sure that `callseq_end` // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. - for (const unsigned I : llvm::seq(ProxyRegOps.size())) { - if (I < RetElts.size() && RetElts[I]) { - InVals.push_back(RetElts[I]); - continue; - } - - SDValue Ret = - DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), - {Chain, ProxyRegOps[I]}); - - const EVT ExpectedVT = Ins[I].VT; - if (!Ret.getValueType().bitsEq(ExpectedVT)) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret); - } + for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) { + SDValue Proxy = + DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg}); + SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl); InVals.push_back(Ret); } - for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), - {Chain, T.getOperand(0)}); - DAG.ReplaceAllUsesWith(T, Repl); - DAG.RemoveDeadNode(T.getNode()); - } - - // set isTailCall to false for now, until we figure out how to express + // set IsTailCall to false for now, until we figure out how to express // tail call optimization in PTX - isTailCall = false; - return Chain; + CLI.IsTailCall = false; + return CallEnd; } SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, @@ -5100,7 +4917,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return SDValue(); auto *LD = cast<MemSDNode>(N); - EVT MemVT = LD->getMemoryVT(); SDLoc DL(LD); // the new opcode after we double the number of operands @@ -5117,10 +4933,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Operands.push_back(DCI.DAG.getIntPtrConstant( cast<LoadSDNode>(LD)->getExtensionType(), DL)); break; - case NVPTXISD::LoadParamV2: - OldNumOutputs = 2; - Opcode = NVPTXISD::LoadParamV4; - break; case NVPTXISD::LoadV2: OldNumOutputs = 2; Opcode = NVPTXISD::LoadV4; @@ -5145,9 +4957,9 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end()); // Create the new load - SDValue NewLoad = - DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs), - Operands, MemVT, LD->getMemOperand()); + SDValue NewLoad = DCI.DAG.getMemIntrinsicNode( + Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(), + LD->getMemOperand()); // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep // the outputs the same. These nodes will be optimized away in later @@ -5189,7 +5001,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); auto *ST = cast<MemSDNode>(N); - EVT MemVT = ElementVT.getVectorElementType(); // The new opcode after we double the number of operands. NVPTXISD::NodeType Opcode; @@ -5198,17 +5009,9 @@ static SDValue combinePackingMovIntoStore(SDNode *N, // Any packed type is legal, so the legalizer will not have lowered // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do // it here. - MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV2; break; - case NVPTXISD::StoreParam: - Opcode = NVPTXISD::StoreParamV2; - break; - case NVPTXISD::StoreParamV2: - Opcode = NVPTXISD::StoreParamV4; - break; case NVPTXISD::StoreV2: - MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV4; break; case NVPTXISD::StoreV4: @@ -5218,7 +5021,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); Opcode = NVPTXISD::StoreV8; break; - case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands return SDValue(); @@ -5260,19 +5062,7 @@ static SDValue combinePackingMovIntoStore(SDNode *N, // Now we replace the store return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands, - MemVT, ST->getMemOperand()); -} - -static SDValue PerformStoreCombineHelper(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - unsigned Front, unsigned Back) { - if (all_of(N->ops().drop_front(Front).drop_back(Back), - [](const SDUse &U) { return U.get()->isUndef(); })) - // Operand 0 is the previous value in the chain. Cannot return EntryToken - // as the previous value will become unused and eliminated later. - return N->getOperand(0); - - return combinePackingMovIntoStore(N, DCI, Front, Back); + ST->getMemoryVT(), ST->getMemOperand()); } static SDValue PerformStoreCombine(SDNode *N, @@ -5280,13 +5070,6 @@ static SDValue PerformStoreCombine(SDNode *N, return combinePackingMovIntoStore(N, DCI, 1, 2); } -static SDValue PerformStoreParamCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Operands from the 3rd to the 2nd last one are the values to be stored. - // {Chain, ArgID, Offset, Val, Glue} - return PerformStoreCombineHelper(N, DCI, 3, 1); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, @@ -5432,6 +5215,42 @@ static SDValue PerformREMCombine(SDNode *N, return SDValue(); } +// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y) +static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.hasOneUse()) + return SDValue(); + EVT ToVT = N->getValueType(0); + EVT FromVT = Op.getValueType(); + if (!((ToVT == MVT::i32 && FromVT == MVT::i16) || + (ToVT == MVT::i64 && FromVT == MVT::i32))) + return SDValue(); + if (!(Op.getOpcode() == ISD::MUL || + (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1))))) + return SDValue(); + + SDLoc DL(N); + unsigned ExtOpcode = N->getOpcode(); + unsigned Opcode = 0; + if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_SIGNED; + else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_UNSIGNED; + else + return SDValue(); + SDValue RHS = Op.getOperand(1); + if (Op.getOpcode() == ISD::SHL) { + const auto ShiftAmt = Op.getConstantOperandVal(1); + const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, DL, ToVT); + } + return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -5942,6 +5761,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, N->getConstantOperandAPInt(2), N->getConstantOperandVal(3)), SDLoc(N), N->getValueType(0)); + return SDValue(); +} + +// During call lowering we wrap the return values in a ProxyReg node which +// depend on the chain value produced by the completed call. This ensures that +// the full call is emitted in cases where libcalls are used to legalize +// operations. To improve the functioning of other DAG combines we pull all +// operations we can through one of these nodes, ensuring that the ProxyReg +// directly wraps a load. That is: +// +// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0))) +// +static SDValue sinkProxyReg(SDValue R, SDValue Chain, + TargetLowering::DAGCombinerInfo &DCI) { + switch (R.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::BITCAST: { + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V); + return SDValue(); + } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::OR: { + if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI)) + if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B); + return SDValue(); + } + case ISD::Constant: + return R; + case ISD::LOAD: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: { + return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(), + {Chain, R}); + } + case ISD::BUILD_VECTOR: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SmallVector<SDValue, 16> Ops; + for (auto &Op : R->ops()) { + SDValue V = sinkProxyReg(Op, Chain, DCI); + if (!V) + return SDValue(); + Ops.push_back(V); + } + return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops); + } + case ISD::EXTRACT_VECTOR_ELT: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R), + R.getValueType(), V, R.getOperand(1)); + return SDValue(); + } + default: + return SDValue(); + } +} + +static SDValue combineProxyReg(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + // If the ProxyReg is not wrapping a load, try to pull the operations through + // the ProxyReg. + if (Reg.getOpcode() != ISD::LOAD) { + if (SDValue V = sinkProxyReg(Reg, Chain, DCI)) + return V; + } return SDValue(); } @@ -5958,6 +5857,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return combineADDRSPACECAST(N, DCI); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return combineMulWide(N, DCI, OptLevel); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: @@ -5965,7 +5867,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); case ISD::LOAD: - case NVPTXISD::LoadParamV2: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: return combineUnpackingMovIntoLoad(N, DCI); @@ -5973,6 +5874,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: return combinePRMT(N, DCI, OptLevel); + case NVPTXISD::ProxyReg: + return combineProxyReg(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::SHL: @@ -5980,10 +5883,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SREM: case ISD::UREM: return PerformREMCombine(N, DCI, OptLevel); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -6332,6 +6231,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, Results.push_back(NewValue.getValue(3)); } +static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType()); + + SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT); + SDValue NewProxy = + DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg}); + SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0)); + + Results.push_back(Res); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6349,6 +6264,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( case ISD::CopyFromReg: ReplaceCopyFromReg_128(N, DAG, Results); return; + case NVPTXISD::ProxyReg: + replaceProxyReg(N, DAG, *this, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 228e2aa..cf72a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -38,7 +38,7 @@ enum NodeType : unsigned { /// This node represents a PTX call instruction. It's operands are as follows: /// /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) CALL, MoveParam, @@ -84,13 +84,7 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - LAST_MEMORY_OPCODE = StoreParamV4, + LAST_MEMORY_OPCODE = StoreV8, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td index 86dcb4a..719be03 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td @@ -11,15 +11,9 @@ // //===----------------------------------------------------------------------===// -// Vector instruction type enum -class VecInstTypeEnum<bits<4> val> { - bits<4> Value=val; -} -def VecNOP : VecInstTypeEnum<0>; - // Generic NVPTX Format -class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> +class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern = []> : Instruction { field bits<14> Inst; @@ -30,7 +24,6 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> let Pattern = pattern; // TSFlagFields - bits<4> VecInstType = VecNOP.Value; bit IsLoad = false; bit IsStore = false; @@ -45,7 +38,6 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> // 2**(2-1) = 2. bits<2> IsSuld = 0; - let TSFlags{3...0} = VecInstType; let TSFlags{4} = IsLoad; let TSFlags{5} = IsStore; let TSFlags{6} = IsTex; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index e218ef1..34fe467 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -35,23 +35,23 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (RegInfo.getRegSizeInBits(*DestRC) != RegInfo.getRegSizeInBits(*SrcRC)) + if (DestRC != SrcRC) report_fatal_error("Copy one register into another with a different width"); unsigned Op; - if (DestRC == &NVPTX::B1RegClass) { - Op = NVPTX::IMOV1r; - } else if (DestRC == &NVPTX::B16RegClass) { - Op = NVPTX::MOV16r; - } else if (DestRC == &NVPTX::B32RegClass) { - Op = NVPTX::IMOV32r; - } else if (DestRC == &NVPTX::B64RegClass) { - Op = NVPTX::IMOV64r; - } else if (DestRC == &NVPTX::B128RegClass) { - Op = NVPTX::IMOV128r; - } else { + if (DestRC == &NVPTX::B1RegClass) + Op = NVPTX::MOV_B1_r; + else if (DestRC == &NVPTX::B16RegClass) + Op = NVPTX::MOV_B16_r; + else if (DestRC == &NVPTX::B32RegClass) + Op = NVPTX::MOV_B32_r; + else if (DestRC == &NVPTX::B64RegClass) + Op = NVPTX::MOV_B64_r; + else if (DestRC == &NVPTX::B128RegClass) + Op = NVPTX::MOV_B128_r; + else llvm_unreachable("Bad register copy"); - } + BuildMI(MBB, I, DL, get(Op), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 442b900..d8047d3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -15,19 +15,8 @@ include "NVPTXInstrFormats.td" let OperandType = "OPERAND_IMMEDIATE" in { def f16imm : Operand<f16>; def bf16imm : Operand<bf16>; - } -// List of vector specific properties -def isVecLD : VecInstTypeEnum<1>; -def isVecST : VecInstTypeEnum<2>; -def isVecBuild : VecInstTypeEnum<3>; -def isVecShuffle : VecInstTypeEnum<4>; -def isVecExtract : VecInstTypeEnum<5>; -def isVecInsert : VecInstTypeEnum<6>; -def isVecDest : VecInstTypeEnum<7>; -def isVecOther : VecInstTypeEnum<15>; - //===----------------------------------------------------------------------===// // NVPTX Operand Definitions. //===----------------------------------------------------------------------===// @@ -125,8 +114,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doRsqrtOpt : Predicate<"doRsqrtOpt()">; -def doMulWide : Predicate<"doMulWide">; - def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; @@ -486,46 +473,28 @@ let hasSideEffects = false in { // takes a CvtMode immediate that defines the conversion mode to use. It can // be CvtNONE to omit a conversion mode. multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> { - def _s8 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B16:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s8">, - Requires<Preds>; - def _u8 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B16:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u8">, - Requires<Preds>; - def _s16 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B16:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s16">, - Requires<Preds>; - def _u16 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B16:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u16">, - Requires<Preds>; - def _s32 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B32:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s32">, - Requires<Preds>; - def _u32 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B32:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u32">, - Requires<Preds>; - def _s64 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B64:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s64">, - Requires<Preds>; - def _u64 : - BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B64:$src), (ins CvtMode:$mode), - "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u64">, - Requires<Preds>; + foreach sign = ["s", "u"] in { + def _ # sign # "8" : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B16:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # "." # sign # "8">, + Requires<Preds>; + def _ # sign # "16" : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B16:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # "." # sign # "16">, + Requires<Preds>; + def _ # sign # "32" : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # "." # sign # "32">, + Requires<Preds>; + def _ # sign # "64" : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B64:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # "." # sign # "64">, + Requires<Preds>; + } def _f16 : BasicFlagsNVPTXInst<(outs RC:$dst), (ins B16:$src), (ins CvtMode:$mode), @@ -556,14 +525,12 @@ let hasSideEffects = false in { } // Generate cvts from all types to all types. - defm CVT_s8 : CVT_FROM_ALL<"s8", B16>; - defm CVT_u8 : CVT_FROM_ALL<"u8", B16>; - defm CVT_s16 : CVT_FROM_ALL<"s16", B16>; - defm CVT_u16 : CVT_FROM_ALL<"u16", B16>; - defm CVT_s32 : CVT_FROM_ALL<"s32", B32>; - defm CVT_u32 : CVT_FROM_ALL<"u32", B32>; - defm CVT_s64 : CVT_FROM_ALL<"s64", B64>; - defm CVT_u64 : CVT_FROM_ALL<"u64", B64>; + foreach sign = ["s", "u"] in { + defm CVT_ # sign # "8" : CVT_FROM_ALL<sign # "8", B16>; + defm CVT_ # sign # "16" : CVT_FROM_ALL<sign # "16", B16>; + defm CVT_ # sign # "32" : CVT_FROM_ALL<sign # "32", B32>; + defm CVT_ # sign # "64" : CVT_FROM_ALL<sign # "64", B64>; + } defm CVT_f16 : CVT_FROM_ALL<"f16", B16>; defm CVT_bf16 : CVT_FROM_ALL<"bf16", B16, [hasPTX<78>, hasSM<90>]>; defm CVT_f32 : CVT_FROM_ALL<"f32", B32>; @@ -571,18 +538,12 @@ let hasSideEffects = false in { // These cvts are different from those above: The source and dest registers // are of the same type. - def CVT_INREG_s16_s8 : BasicNVPTXInst<(outs B16:$dst), (ins B16:$src), - "cvt.s16.s8">; - def CVT_INREG_s32_s8 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), - "cvt.s32.s8">; - def CVT_INREG_s32_s16 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), - "cvt.s32.s16">; - def CVT_INREG_s64_s8 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), - "cvt.s64.s8">; - def CVT_INREG_s64_s16 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), - "cvt.s64.s16">; - def CVT_INREG_s64_s32 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), - "cvt.s64.s32">; + def CVT_INREG_s16_s8 : BasicNVPTXInst<(outs B16:$dst), (ins B16:$src), "cvt.s16.s8">; + def CVT_INREG_s32_s8 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "cvt.s32.s8">; + def CVT_INREG_s32_s16 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "cvt.s32.s16">; + def CVT_INREG_s64_s8 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), "cvt.s64.s8">; + def CVT_INREG_s64_s16 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), "cvt.s64.s16">; + def CVT_INREG_s64_s32 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), "cvt.s64.s32">; multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { def _f32 : @@ -784,7 +745,7 @@ defm SUB : I3<"sub.s", sub, commutative = false>; def ADD16x2 : I16x2<"add.s", add>; -// in32 and int64 addition and subtraction with carry-out. +// int32 and int64 addition and subtraction with carry-out. defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>; defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; @@ -805,17 +766,17 @@ defm UDIV : I3<"div.u", udiv, commutative = false>; defm SREM : I3<"rem.s", srem, commutative = false>; defm UREM : I3<"rem.u", urem, commutative = false>; -// Integer absolute value. NumBits should be one minus the bit width of RC. -// This idiom implements the algorithm at -// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs. -multiclass ABS<ValueType T, RegisterClass RC, string SizeName> { - def : BasicNVPTXInst<(outs RC:$dst), (ins RC:$a), - "abs" # SizeName, - [(set T:$dst, (abs T:$a))]>; +foreach t = [I16RT, I32RT, I64RT] in { + def ABS_S # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a), + "abs.s" # t.Size, + [(set t.Ty:$dst, (abs t.Ty:$a))]>; + + def NEG_S # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), + "neg.s" # t.Size, + [(set t.Ty:$dst, (ineg t.Ty:$src))]>; } -defm ABS_16 : ABS<i16, B16, ".s16">; -defm ABS_32 : ABS<i32, B32, ".s32">; -defm ABS_64 : ABS<i64, B64, ".s64">; // Integer min/max. defm SMAX : I3<"max.s", smax, commutative = true>; @@ -832,170 +793,63 @@ def UMIN16x2 : I16x2<"min.u", umin>; // // Wide multiplication // -def MULWIDES64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">; -def MULWIDES64Imm : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">; -def MULWIDES64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">; - -def MULWIDEU64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">; -def MULWIDEU64Imm : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">; -def MULWIDEU64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">; - -def MULWIDES32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">; -def MULWIDES32Imm : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">; -def MULWIDES32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">; - -def MULWIDEU32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">; -def MULWIDEU32Imm : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">; -def MULWIDEU32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">; - -def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; -def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; - -// Matchers for signed, unsigned mul.wide ISD nodes. -let Predicates = [doMulWide] in { - def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; - def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; - def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; - def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>; - - def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>; - def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>; - def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), (MULWIDEU64 $a, $b)>; - def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; -} - -// Predicates used for converting some patterns to mul.wide. -def SInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(32); -}]>; - -def UInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(32); -}]>; -def SInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(16); -}]>; - -def UInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(16); -}]>; - -def IntConst_0_30 : PatLeaf<(imm), [{ - // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(31); -}]>; - -def IntConst_0_14 : PatLeaf<(imm), [{ - // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(15); -}]>; - -def SHL2MUL32 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(32, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); -}]>; +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; +def smul_wide : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>; +def umul_wide : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>; -def SHL2MUL16 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(16, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); -}]>; - -// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. -let Predicates = [doMulWide] in { - def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDES64Imm $a, (SHL2MUL32 $b))>; - def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDEU64Imm $a, (SHL2MUL32 $b))>; - - def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDES32Imm $a, (SHL2MUL16 $b))>; - def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDEU32Imm $a, (SHL2MUL16 $b))>; - - // Convert "sign/zero-extend then multiply" to mul.wide. - def : Pat<(mul (sext i32:$a), (sext i32:$b)), - (MULWIDES64 $a, $b)>; - def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>; - - def : Pat<(mul (zext i32:$a), (zext i32:$b)), - (MULWIDEU64 $a, $b)>; - def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>; - def : Pat<(mul (sext i16:$a), (sext i16:$b)), - (MULWIDES32 $a, $b)>; - def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>; - - def : Pat<(mul (zext i16:$a), (zext i16:$b)), - (MULWIDEU32 $a, $b)>; - def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>; +multiclass MULWIDEInst<string suffix, SDPatternOperator op, RegTyInfo big_t, RegTyInfo small_t> { + def suffix # _rr : + BasicNVPTXInst<(outs big_t.RC:$dst), (ins small_t.RC:$a, small_t.RC:$b), + "mul.wide." # suffix, + [(set big_t.Ty:$dst, (op small_t.Ty:$a, small_t.Ty:$b))]>; + def suffix # _ri : + BasicNVPTXInst<(outs big_t.RC:$dst), (ins small_t.RC:$a, small_t.Imm:$b), + "mul.wide." # suffix, + [(set big_t.Ty:$dst, (op small_t.Ty:$a, imm:$b))]>; } +defm MUL_WIDE : MULWIDEInst<"s32", smul_wide, I64RT, I32RT>; +defm MUL_WIDE : MULWIDEInst<"u32", umul_wide, I64RT, I32RT>; +defm MUL_WIDE : MULWIDEInst<"s16", smul_wide, I32RT, I16RT>; +defm MUL_WIDE : MULWIDEInst<"u16", umul_wide, I32RT, I16RT>; + // // Integer multiply-add // -def mul_oneuse : OneUse2<mul>; - -multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> { +multiclass MADInst<string suffix, SDPatternOperator op, RegTyInfo big_t, RegTyInfo small_t> { def rrr: - BasicNVPTXInst<(outs Reg:$dst), - (ins Reg:$a, Reg:$b, Reg:$c), - Ptx, - [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>; - - def rir: - BasicNVPTXInst<(outs Reg:$dst), - (ins Reg:$a, Imm:$b, Reg:$c), - Ptx, - [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>; + BasicNVPTXInst<(outs big_t.RC:$dst), + (ins small_t.RC:$a, small_t.RC:$b, big_t.RC:$c), + "mad." # suffix, + [(set big_t.Ty:$dst, (add (OneUse2<op> small_t.Ty:$a, small_t.Ty:$b), big_t.Ty:$c))]>; def rri: - BasicNVPTXInst<(outs Reg:$dst), - (ins Reg:$a, Reg:$b, Imm:$c), - Ptx, - [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>; + BasicNVPTXInst<(outs big_t.RC:$dst), + (ins small_t.RC:$a, small_t.RC:$b, big_t.Imm:$c), + "mad." # suffix, + [(set big_t.Ty:$dst, (add (OneUse2<op> small_t.Ty:$a, small_t.Ty:$b), imm:$c))]>; + def rir: + BasicNVPTXInst<(outs big_t.RC:$dst), + (ins small_t.RC:$a, small_t.Imm:$b, big_t.RC:$c), + "mad." # suffix, + [(set big_t.Ty:$dst, (add (OneUse2<op> small_t.Ty:$a, imm:$b), big_t.Ty:$c))]>; def rii: - BasicNVPTXInst<(outs Reg:$dst), - (ins Reg:$a, Imm:$b, Imm:$c), - Ptx, - [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>; + BasicNVPTXInst<(outs big_t.RC:$dst), + (ins small_t.RC:$a, small_t.Imm:$b, big_t.Imm:$c), + "mad." # suffix, + [(set big_t.Ty:$dst, (add (OneUse2<op> small_t.Ty:$a, imm:$b), imm:$c))]>; } let Predicates = [hasOptEnabled] in { -defm MAD16 : MAD<"mad.lo.s16", i16, B16, i16imm>; -defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>; -defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>; -} + defm MAD_LO_S16 : MADInst<"lo.s16", mul, I16RT, I16RT>; + defm MAD_LO_S32 : MADInst<"lo.s32", mul, I32RT, I32RT>; + defm MAD_LO_S64 : MADInst<"lo.s64", mul, I64RT, I64RT>; -foreach t = [I16RT, I32RT, I64RT] in { - def NEG_S # t.Size : - BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), - "neg.s" # t.Size, - [(set t.Ty:$dst, (ineg t.Ty:$src))]>; + defm MAD_WIDE_U16 : MADInst<"wide.u16", umul_wide, I32RT, I16RT>; + defm MAD_WIDE_S16 : MADInst<"wide.s16", smul_wide, I32RT, I16RT>; + defm MAD_WIDE_U32 : MADInst<"wide.u32", umul_wide, I64RT, I32RT>; + defm MAD_WIDE_S32 : MADInst<"wide.s32", smul_wide, I64RT, I32RT>; } //----------------------------------- @@ -1106,8 +960,7 @@ def fdiv_approx : PatFrag<(ops node:$a, node:$b), def FRCP32_approx_r : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$b), - (ins FTZFlag:$ftz), + (ins B32:$b), (ins FTZFlag:$ftz), "rcp.approx$ftz.f32", [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>; @@ -1116,14 +969,12 @@ def FRCP32_approx_r : // def FDIV32_approx_rr : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, B32:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, B32:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>; def FDIV32_approx_ri : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, f32imm:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>; // @@ -1146,14 +997,12 @@ def : Pat<(fdiv_full f32imm_1, f32:$b), // def FDIV32rr : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, B32:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, B32:$b), (ins FTZFlag:$ftz), "div.full$ftz.f32", [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>; def FDIV32ri : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, f32imm:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz), "div.full$ftz.f32", [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>; // @@ -1167,8 +1016,7 @@ def fdiv_ftz : PatFrag<(ops node:$a, node:$b), def FRCP32r_prec : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$b), - (ins FTZFlag:$ftz), + (ins B32:$b), (ins FTZFlag:$ftz), "rcp.rn$ftz.f32", [(set f32:$dst, (fdiv_ftz f32imm_1, f32:$b))]>; // @@ -1176,14 +1024,12 @@ def FRCP32r_prec : // def FDIV32rr_prec : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, B32:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, B32:$b), (ins FTZFlag:$ftz), "div.rn$ftz.f32", [(set f32:$dst, (fdiv_ftz f32:$a, f32:$b))]>; def FDIV32ri_prec : BasicFlagsNVPTXInst<(outs B32:$dst), - (ins B32:$a, f32imm:$b), - (ins FTZFlag:$ftz), + (ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz), "div.rn$ftz.f32", [(set f32:$dst, (fdiv_ftz f32:$a, fpimm:$b))]>; @@ -1262,10 +1108,8 @@ def TANH_APPROX_f32 : // Template for three-arg bitwise operations. Takes three args, Creates .b16, // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. multiclass BITWISE<string OpcStr, SDNode OpNode> { - defm b1 : I3Inst<OpcStr # ".pred", OpNode, I1RT, commutative = true>; - defm b16 : I3Inst<OpcStr # ".b16", OpNode, I16RT, commutative = true>; - defm b32 : I3Inst<OpcStr # ".b32", OpNode, I32RT, commutative = true>; - defm b64 : I3Inst<OpcStr # ".b64", OpNode, I64RT, commutative = true>; + foreach t = [I1RT, I16RT, I32RT, I64RT] in + defm _ # t.PtxType : I3Inst<OpcStr # "." # t.PtxType, OpNode, t, commutative = true>; } defm OR : BITWISE<"or", or>; @@ -1273,48 +1117,40 @@ defm AND : BITWISE<"and", and>; defm XOR : BITWISE<"xor", xor>; // PTX does not support mul on predicates, convert to and instructions -def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>; -def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>; +def : Pat<(mul i1:$a, i1:$b), (AND_predrr $a, $b)>; +def : Pat<(mul i1:$a, imm:$b), (AND_predri $a, imm:$b)>; foreach op = [add, sub] in { - def : Pat<(op i1:$a, i1:$b), (XORb1rr $a, $b)>; - def : Pat<(op i1:$a, imm:$b), (XORb1ri $a, imm:$b)>; + def : Pat<(op i1:$a, i1:$b), (XOR_predrr $a, $b)>; + def : Pat<(op i1:$a, imm:$b), (XOR_predri $a, imm:$b)>; } // These transformations were once reliably performed by instcombine, but thanks // to poison semantics they are no longer safe for LLVM IR, perform them here // instead. -def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>; -def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>; +def : Pat<(select i1:$a, i1:$b, 0), (AND_predrr $a, $b)>; +def : Pat<(select i1:$a, 1, i1:$b), (OR_predrr $a, $b)>; // Lower logical v2i16/v4i8 ops as bitwise ops on b32. foreach vt = [v2i16, v4i8] in { - def : Pat<(or vt:$a, vt:$b), (ORb32rr $a, $b)>; - def : Pat<(xor vt:$a, vt:$b), (XORb32rr $a, $b)>; - def : Pat<(and vt:$a, vt:$b), (ANDb32rr $a, $b)>; + def : Pat<(or vt:$a, vt:$b), (OR_b32rr $a, $b)>; + def : Pat<(xor vt:$a, vt:$b), (XOR_b32rr $a, $b)>; + def : Pat<(and vt:$a, vt:$b), (AND_b32rr $a, $b)>; // The constants get legalized into a bitcast from i32, so that's what we need // to match here. def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))), - (ORb32ri $a, imm:$b)>; + (OR_b32ri $a, imm:$b)>; def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))), - (XORb32ri $a, imm:$b)>; + (XOR_b32ri $a, imm:$b)>; def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))), - (ANDb32ri $a, imm:$b)>; -} - -def NOT1 : BasicNVPTXInst<(outs B1:$dst), (ins B1:$src), - "not.pred", - [(set i1:$dst, (not i1:$src))]>; -def NOT16 : BasicNVPTXInst<(outs B16:$dst), (ins B16:$src), - "not.b16", - [(set i16:$dst, (not i16:$src))]>; -def NOT32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), - "not.b32", - [(set i32:$dst, (not i32:$src))]>; -def NOT64 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), - "not.b64", - [(set i64:$dst, (not i64:$src))]>; + (AND_b32ri $a, imm:$b)>; +} + +foreach t = [I1RT, I16RT, I32RT, I64RT] in + def NOT_ # t.PtxType : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), + "not." # t.PtxType, + [(set t.Ty:$dst, (not t.Ty:$src))]>; // Template for left/right shifts. Takes three operands, // [dest (reg), src (reg), shift (reg or imm)]. @@ -1322,34 +1158,22 @@ def NOT64 : BasicNVPTXInst<(outs B64:$dst), (ins B64:$src), // // This template also defines a 32-bit shift (imm, imm) instruction. multiclass SHIFT<string OpcStr, SDNode OpNode> { - def i64rr : - BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, B32:$b), - OpcStr # "64", - [(set i64:$dst, (OpNode i64:$a, i32:$b))]>; - def i64ri : - BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, i32imm:$b), - OpcStr # "64", - [(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>; - def i32rr : - BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), - OpcStr # "32", - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def i32ri : - BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, i32imm:$b), - OpcStr # "32", - [(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>; - def i32ii : - BasicNVPTXInst<(outs B32:$dst), (ins i32imm:$a, i32imm:$b), - OpcStr # "32", - [(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; - def i16rr : - BasicNVPTXInst<(outs B16:$dst), (ins B16:$a, B32:$b), - OpcStr # "16", - [(set i16:$dst, (OpNode i16:$a, i32:$b))]>; - def i16ri : - BasicNVPTXInst<(outs B16:$dst), (ins B16:$a, i32imm:$b), - OpcStr # "16", - [(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>; + let hasSideEffects = false in { + foreach t = [I64RT, I32RT, I16RT] in { + def t.Size # _rr : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, B32:$b), + OpcStr # t.Size, + [(set t.Ty:$dst, (OpNode t.Ty:$a, i32:$b))]>; + def t.Size # _ri : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, i32imm:$b), + OpcStr # t.Size, + [(set t.Ty:$dst, (OpNode t.Ty:$a, (i32 imm:$b)))]>; + def t.Size # _ii : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, i32imm:$b), + OpcStr # t.Size, + [(set t.Ty:$dst, (OpNode (t.Ty imm:$a), (i32 imm:$b)))]>; + } + } } defm SHL : SHIFT<"shl.b", shl>; @@ -1357,14 +1181,11 @@ defm SRA : SHIFT<"shr.s", sra>; defm SRL : SHIFT<"shr.u", srl>; // Bit-reverse -def BREV32 : - BasicNVPTXInst<(outs B32:$dst), (ins B32:$a), - "brev.b32", - [(set i32:$dst, (bitreverse i32:$a))]>; -def BREV64 : - BasicNVPTXInst<(outs B64:$dst), (ins B64:$a), - "brev.b64", - [(set i64:$dst, (bitreverse i64:$a))]>; +foreach t = [I64RT, I32RT] in + def BREV_ # t.PtxType : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a), + "brev." # t.PtxType, + [(set t.Ty:$dst, (bitreverse t.Ty:$a))]>; // @@ -1516,20 +1337,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN // Byte extraction via shift/trunc/sext -def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), - (CVT_s8_s32 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), +def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>; + +def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>; + +def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), - (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), + (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; + def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), - (BFE_S64rii $s, imm:$o, 8)>; -def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), - (CVT_s8_s64 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), - (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; //----------------------------------- // Comparison instructions (setp, set) @@ -1619,10 +1439,7 @@ def SETP_bf16x2rr : def addr : ComplexPattern<pAny, 2, "SelectADDR">; -def ADDR_base : Operand<pAny> { - let PrintMethod = "printOperand"; -} - +def ADDR_base : Operand<pAny>; def ADDR : Operand<pAny> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops ADDR_base, i32imm); @@ -1636,10 +1453,6 @@ def MmaCode : Operand<i32> { let PrintMethod = "printMmaCode"; } -def Offseti32imm : Operand<i32> { - let PrintMethod = "printOffseti32imm"; -} - // Get pointer to local stack. let hasSideEffects = false in { def MOV_DEPOT_ADDR : NVPTXInst<(outs B32:$d), (ins i32imm:$num), @@ -1651,33 +1464,31 @@ let hasSideEffects = false in { // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp let hasSideEffects = false, isAsCheapAsAMove = true in { - // Class for register-to-register moves - class MOVr<RegisterClass RC, string OpStr> : - BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), - "mov." # OpStr>; - - // Class for immediate-to-register moves - class MOVi<RegisterClass RC, string OpStr, ValueType VT, Operand IMMType, SDNode ImmNode> : - BasicNVPTXInst<(outs RC:$dst), (ins IMMType:$src), - "mov." # OpStr, - [(set VT:$dst, ImmNode:$src)]>; -} + let isMoveReg = true in + class MOVr<RegisterClass RC, string OpStr> : + BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>; -def IMOV1r : MOVr<B1, "pred">; -def MOV16r : MOVr<B16, "b16">; -def IMOV32r : MOVr<B32, "b32">; -def IMOV64r : MOVr<B64, "b64">; -def IMOV128r : MOVr<B128, "b128">; + let isMoveImm = true in + class MOVi<RegTyInfo t, string suffix> : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src), + "mov." # suffix, + [(set t.Ty:$dst, t.ImmNode:$src)]>; +} +def MOV_B1_r : MOVr<B1, "pred">; +def MOV_B16_r : MOVr<B16, "b16">; +def MOV_B32_r : MOVr<B32, "b32">; +def MOV_B64_r : MOVr<B64, "b64">; +def MOV_B128_r : MOVr<B128, "b128">; -def IMOV1i : MOVi<B1, "pred", i1, i1imm, imm>; -def IMOV16i : MOVi<B16, "b16", i16, i16imm, imm>; -def IMOV32i : MOVi<B32, "b32", i32, i32imm, imm>; -def IMOV64i : MOVi<B64, "b64", i64, i64imm, imm>; -def FMOV16i : MOVi<B16, "b16", f16, f16imm, fpimm>; -def BFMOV16i : MOVi<B16, "b16", bf16, bf16imm, fpimm>; -def FMOV32i : MOVi<B32, "b32", f32, f32imm, fpimm>; -def FMOV64i : MOVi<B64, "b64", f64, f64imm, fpimm>; +def MOV_B1_i : MOVi<I1RT, "pred">; +def MOV_B16_i : MOVi<I16RT, "b16">; +def MOV_B32_i : MOVi<I32RT, "b32">; +def MOV_B64_i : MOVi<I64RT, "b64">; +def MOV_F16_i : MOVi<F16RT, "b16">; +def MOV_BF16_i : MOVi<BF16RT, "b16">; +def MOV_F32_i : MOVi<F32RT, "b32">; +def MOV_F64_i : MOVi<F64RT, "b64">; def to_tglobaladdr : SDNodeXForm<globaladdr, [{ @@ -1695,11 +1506,11 @@ def to_tframeindex : SDNodeXForm<frameindex, [{ return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0)); }]>; -def : Pat<(i32 globaladdr:$dst), (IMOV32i (to_tglobaladdr $dst))>; -def : Pat<(i64 globaladdr:$dst), (IMOV64i (to_tglobaladdr $dst))>; +def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>; +def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>; -def : Pat<(i32 externalsym:$dst), (IMOV32i (to_texternsym $dst))>; -def : Pat<(i64 externalsym:$dst), (IMOV64i (to_texternsym $dst))>; +def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>; +def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr), @@ -1713,56 +1524,39 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; //----------------------------------- // Comparison and Selection //----------------------------------- +// TODO: These patterns seem very specific and brittle. We should try to find +// a more general solution. def cond_signed : PatLeaf<(cond), [{ return isSignedIntSetCC(N->get()); }]>; -def cond_not_signed : PatLeaf<(cond), [{ - return !isSignedIntSetCC(N->get()); -}]>; +// A 16-bit signed comparison of sign-extended byte extracts can be converted +// to 32-bit comparison if we change the PRMT to sign-extend the extracted +// bytes. +def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), + (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), + cond_signed:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE), + (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE), + (cond2cc $cc))>; + +// A 16-bit comparison of truncated byte extracts can be be converted to 32-bit +// comparison because we know that the truncate is just trancating off zeros +// and that the most-significant byte is also zeros so the meaning of signed and +// unsigned comparisons will not be changed. +def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), + cond:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -// comparisons of i8 extracted with PRMT as i32 -// It's faster to do comparison directly on i32 extracted by PRMT, -// instead of the long conversion and sign extending. -def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)), - (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), - (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_not_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1774,104 +1568,20 @@ def declare_array_param : def declare_scalar_param : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def proxy_reg : SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) def SDTCallProfile : SDTypeProfile<0, 6, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; -def call : - SDNode<"NVPTXISD::CALL", SDTCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -let mayLoad = true in { - class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), - []>; - - class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0$b];"), []>; - - class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins Offseti32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), - []>; -} - -let mayStore = true in { - - multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { - foreach op = [IMMType, regclass] in - if !or(support_imm, !isa<NVPTXRegClass>(op)) then - def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") - : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, Offseti32imm:$b), - "st.param" # opstr # " \t[param$a$b], $val;", - []>; - } - - multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, - i32imm:$a, Offseti32imm:$b), - "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", - []>; - } - - multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - foreach op3 = [IMMType, regclass] in - foreach op4 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - # !if(!isa<NVPTXRegClass>(op3), "r", "i") - # !if(!isa<NVPTXRegClass>(op4), "r", "i") - - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, Offseti32imm:$b), - "st.param.v4" # opstr # - " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", - []>; - } -} +def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, -/// NumParams, Callee, Proto, InGlue) +/// NumParams, Callee, Proto) def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; } @@ -1908,43 +1618,6 @@ foreach is_convergent = [0, 1] in { (call_uni_inst $addr, imm:$rets, imm:$params)>; } -def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">; -def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">; -def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">; - -defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">; -defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">; -defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">; -defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">; - -defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>; -defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>; - -defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">; -defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">; -defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">; -defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">; - -defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">; -defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">; -defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">; - -defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">; -defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">; - -defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">; -defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">; - -defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">; - def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), ".param .align $align .b8 \t$a[$size];", []>; @@ -1957,6 +1630,18 @@ def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), def : Pat<(declare_scalar_param externalsym:$a, imm:$size), (DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>; +// Call prototype wrapper, this is a dummy instruction that just prints it's +// operand which is string defining the prototype. +def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; } +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; + + foreach t = [I32RT, I64RT] in { defvar inst_name = "MOV" # t.Size # "_PARAM"; def inst_name : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), "mov.b" # t.Size>; @@ -1976,6 +1661,32 @@ defm ProxyRegB16 : ProxyRegInst<"b16", B16>; defm ProxyRegB32 : ProxyRegInst<"b32", B32>; defm ProxyRegB64 : ProxyRegInst<"b64", B64>; + +// Callseq start and end + +// Note: these nodes are marked as SDNPMayStore and SDNPMayLoad because +// they define the scope in which the declared params may be used. Therefore +// we add these flags to ensure ld.param and st.param are not sunk or hoisted +// out of that scope. + +def callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; + +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; + // // Load / Store Handling // @@ -1988,7 +1699,6 @@ class LD<NVPTXRegClass regclass> "\t$dst, [$addr];", []>; let mayLoad=1, hasSideEffects=0 in { - def LD_i8 : LD<B16>; def LD_i16 : LD<B16>; def LD_i32 : LD<B32>; def LD_i64 : LD<B64>; @@ -2004,7 +1714,6 @@ class ST<DAGOperand O> " \t[$addr], $src;", []>; let mayStore=1, hasSideEffects=0 in { - def ST_i8 : ST<RI16>; def ST_i16 : ST<RI16>; def ST_i32 : ST<RI32>; def ST_i64 : ST<RI64>; @@ -2037,7 +1746,6 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { "[$addr];", []>; } let mayLoad=1, hasSideEffects=0 in { - defm LDV_i8 : LD_VEC<B16>; defm LDV_i16 : LD_VEC<B16>; defm LDV_i32 : LD_VEC<B32, support_v8 = true>; defm LDV_i64 : LD_VEC<B64>; @@ -2071,7 +1779,6 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { } let mayStore=1, hasSideEffects=0 in { - defm STV_i8 : ST_VEC<RI16>; defm STV_i16 : ST_VEC<RI16>; defm STV_i32 : ST_VEC<RI32, support_v8 = true>; defm STV_i64 : ST_VEC<RI64>; @@ -2241,14 +1948,14 @@ def : Pat<(i64 (anyext i32:$a)), (CVT_u64_u32 $a, CvtNONE)>; // truncate i64 def : Pat<(i32 (trunc i64:$a)), (CVT_u32_u64 $a, CvtNONE)>; def : Pat<(i16 (trunc i64:$a)), (CVT_u16_u64 $a, CvtNONE)>; -def : Pat<(i1 (trunc i64:$a)), (SETP_i64ri (ANDb64ri $a, 1), 0, CmpNE)>; +def : Pat<(i1 (trunc i64:$a)), (SETP_i64ri (AND_b64ri $a, 1), 0, CmpNE)>; // truncate i32 def : Pat<(i16 (trunc i32:$a)), (CVT_u16_u32 $a, CvtNONE)>; -def : Pat<(i1 (trunc i32:$a)), (SETP_i32ri (ANDb32ri $a, 1), 0, CmpNE)>; +def : Pat<(i1 (trunc i32:$a)), (SETP_i32ri (AND_b32ri $a, 1), 0, CmpNE)>; // truncate i16 -def : Pat<(i1 (trunc i16:$a)), (SETP_i16ri (ANDb16ri $a, 1), 0, CmpNE)>; +def : Pat<(i1 (trunc i16:$a)), (SETP_i16ri (AND_b16ri $a, 1), 0, CmpNE)>; // sext_inreg def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>; @@ -2492,52 +2199,20 @@ defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>; //----------------------------------- let isTerminator=1 in { - let isReturn=1, isBarrier=1 in + let isReturn=1, isBarrier=1 in def Return : BasicNVPTXInst<(outs), (ins), "ret", [(retglue)]>; - let isBranch=1 in - def CBranch : NVPTXInst<(outs), (ins B1:$a, brtarget:$target), + let isBranch=1 in { + def CBranch : NVPTXInst<(outs), (ins B1:$a, brtarget:$target), "@$a bra \t$target;", [(brcond i1:$a, bb:$target)]>; - let isBranch=1 in - def CBranchOther : NVPTXInst<(outs), (ins B1:$a, brtarget:$target), - "@!$a bra \t$target;", []>; - let isBranch=1, isBarrier=1 in + let isBarrier=1 in def GOTO : BasicNVPTXInst<(outs), (ins brtarget:$target), - "bra.uni", [(br bb:$target)]>; + "bra.uni", [(br bb:$target)]>; + } } -def : Pat<(brcond i32:$a, bb:$target), - (CBranch (SETP_i32ri $a, 0, CmpNE), bb:$target)>; - -// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a -// conditional branch if the target block is the next block so that the code -// can fall through to the target block. The inversion is done by 'xor -// condition, 1', which will be translated to (setne condition, -1). Since ptx -// supports '@!pred bra target', we should use it. -def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target), - (CBranchOther $a, bb:$target)>; - -// Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; -def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPSideEffect]>; - -def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\{ // callseq $amt1, $amt2", - [(callseq_start timm:$amt1, timm:$amt2)]>; -def Callseq_End : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\} // callseq $amt1", - [(callseq_end timm:$amt1, timm:$amt2)]>; // trap instruction def trapinst : BasicNVPTXInst<(outs), (ins), "trap", [(trap)]>, Requires<[noPTXASUnreachableBug]>; @@ -2547,18 +2222,6 @@ def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[ // brkpt instruction def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>; -// Call prototype wrapper -def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def CallPrototype : - SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def ProtoIdent : Operand<i32> { - let PrintMethod = "printProtoIdent"; -} -def CALL_PROTOTYPE : - NVPTXInst<(outs), (ins ProtoIdent:$ident), - "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; - def SDTDynAllocaOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 0a00220..d337192 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -243,63 +243,82 @@ foreach sync = [false, true] in { } // vote.{all,any,uni,ballot} -multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - def : BasicNVPTXInst<(outs regclass:$dest), (ins B1:$pred), - "vote." # mode, - [(set regclass:$dest, (IntOp i1:$pred))]>, - Requires<[hasPTX<60>, hasSM<30>]>; -} +let Predicates = [hasPTX<60>, hasSM<30>] in { + multiclass VOTE<string mode, RegTyInfo t, Intrinsic op> { + def : BasicNVPTXInst<(outs t.RC:$dest), (ins B1:$pred), + "vote." # mode # "." # t.PtxType, + [(set t.Ty:$dest, (op i1:$pred))]>; + } -defm VOTE_ALL : VOTE<B1, "all.pred", int_nvvm_vote_all>; -defm VOTE_ANY : VOTE<B1, "any.pred", int_nvvm_vote_any>; -defm VOTE_UNI : VOTE<B1, "uni.pred", int_nvvm_vote_uni>; -defm VOTE_BALLOT : VOTE<B32, "ballot.b32", int_nvvm_vote_ballot>; + defm VOTE_ALL : VOTE<"all", I1RT, int_nvvm_vote_all>; + defm VOTE_ANY : VOTE<"any", I1RT, int_nvvm_vote_any>; + defm VOTE_UNI : VOTE<"uni", I1RT, int_nvvm_vote_uni>; + defm VOTE_BALLOT : VOTE<"ballot", I32RT, int_nvvm_vote_ballot>; + + // vote.sync.{all,any,uni,ballot} + multiclass VOTE_SYNC<string mode, RegTyInfo t, Intrinsic op> { + def i : BasicNVPTXInst<(outs t.RC:$dest), (ins B1:$pred, i32imm:$mask), + "vote.sync." # mode # "." # t.PtxType, + [(set t.Ty:$dest, (op imm:$mask, i1:$pred))]>; + def r : BasicNVPTXInst<(outs t.RC:$dest), (ins B1:$pred, B32:$mask), + "vote.sync." # mode # "." # t.PtxType, + [(set t.Ty:$dest, (op i32:$mask, i1:$pred))]>; + } -// vote.sync.{all,any,uni,ballot} -multiclass VOTE_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - def i : BasicNVPTXInst<(outs regclass:$dest), (ins B1:$pred, i32imm:$mask), - "vote.sync." # mode, - [(set regclass:$dest, (IntOp imm:$mask, i1:$pred))]>, - Requires<[hasPTX<60>, hasSM<30>]>; - def r : BasicNVPTXInst<(outs regclass:$dest), (ins B1:$pred, B32:$mask), - "vote.sync." # mode, - [(set regclass:$dest, (IntOp i32:$mask, i1:$pred))]>, - Requires<[hasPTX<60>, hasSM<30>]>; + defm VOTE_SYNC_ALL : VOTE_SYNC<"all", I1RT, int_nvvm_vote_all_sync>; + defm VOTE_SYNC_ANY : VOTE_SYNC<"any", I1RT, int_nvvm_vote_any_sync>; + defm VOTE_SYNC_UNI : VOTE_SYNC<"uni", I1RT, int_nvvm_vote_uni_sync>; + defm VOTE_SYNC_BALLOT : VOTE_SYNC<"ballot", I32RT, int_nvvm_vote_ballot_sync>; } - -defm VOTE_SYNC_ALL : VOTE_SYNC<B1, "all.pred", int_nvvm_vote_all_sync>; -defm VOTE_SYNC_ANY : VOTE_SYNC<B1, "any.pred", int_nvvm_vote_any_sync>; -defm VOTE_SYNC_UNI : VOTE_SYNC<B1, "uni.pred", int_nvvm_vote_uni_sync>; -defm VOTE_SYNC_BALLOT : VOTE_SYNC<B32, "ballot.b32", int_nvvm_vote_ballot_sync>; - // elect.sync +let Predicates = [hasPTX<80>, hasSM<90>] in { def INT_ELECT_SYNC_I : BasicNVPTXInst<(outs B32:$dest, B1:$pred), (ins i32imm:$mask), "elect.sync", - [(set i32:$dest, i1:$pred, (int_nvvm_elect_sync imm:$mask))]>, - Requires<[hasPTX<80>, hasSM<90>]>; + [(set i32:$dest, i1:$pred, (int_nvvm_elect_sync imm:$mask))]>; def INT_ELECT_SYNC_R : BasicNVPTXInst<(outs B32:$dest, B1:$pred), (ins B32:$mask), "elect.sync", - [(set i32:$dest, i1:$pred, (int_nvvm_elect_sync i32:$mask))]>, - Requires<[hasPTX<80>, hasSM<90>]>; + [(set i32:$dest, i1:$pred, (int_nvvm_elect_sync i32:$mask))]>; +} + +let Predicates = [hasPTX<60>, hasSM<70>] in { + multiclass MATCH_ANY_SYNC<Intrinsic op, RegTyInfo t> { + def ii : BasicNVPTXInst<(outs B32:$dest), (ins t.Imm:$value, i32imm:$mask), + "match.any.sync." # t.PtxType, + [(set i32:$dest, (op imm:$mask, imm:$value))]>; + def ir : BasicNVPTXInst<(outs B32:$dest), (ins t.Imm:$value, B32:$mask), + "match.any.sync." # t.PtxType, + [(set i32:$dest, (op i32:$mask, imm:$value))]>; + def ri : BasicNVPTXInst<(outs B32:$dest), (ins t.RC:$value, i32imm:$mask), + "match.any.sync." # t.PtxType, + [(set i32:$dest, (op imm:$mask, t.Ty:$value))]>; + def rr : BasicNVPTXInst<(outs B32:$dest), (ins t.RC:$value, B32:$mask), + "match.any.sync." # t.PtxType, + [(set i32:$dest, (op i32:$mask, t.Ty:$value))]>; + } -multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, - Operand ImmOp> { - def ii : BasicNVPTXInst<(outs B32:$dest), (ins ImmOp:$value, i32imm:$mask), - "match.any.sync." # ptxtype, - [(set i32:$dest, (IntOp imm:$mask, imm:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def ir : BasicNVPTXInst<(outs B32:$dest), (ins ImmOp:$value, B32:$mask), - "match.any.sync." # ptxtype, - [(set i32:$dest, (IntOp i32:$mask, imm:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def ri : BasicNVPTXInst<(outs B32:$dest), (ins regclass:$value, i32imm:$mask), - "match.any.sync." # ptxtype, - [(set i32:$dest, (IntOp imm:$mask, regclass:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def rr : BasicNVPTXInst<(outs B32:$dest), (ins regclass:$value, B32:$mask), - "match.any.sync." # ptxtype, - [(set i32:$dest, (IntOp i32:$mask, regclass:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; + defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<int_nvvm_match_any_sync_i32, I32RT>; + defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<int_nvvm_match_any_sync_i64, I64RT>; + + multiclass MATCH_ALLP_SYNC<RegTyInfo t, Intrinsic op> { + def ii : BasicNVPTXInst<(outs B32:$dest, B1:$pred), + (ins t.Imm:$value, i32imm:$mask), + "match.all.sync." # t.PtxType, + [(set i32:$dest, i1:$pred, (op imm:$mask, imm:$value))]>; + def ir : BasicNVPTXInst<(outs B32:$dest, B1:$pred), + (ins t.Imm:$value, B32:$mask), + "match.all.sync." # t.PtxType, + [(set i32:$dest, i1:$pred, (op i32:$mask, imm:$value))]>; + def ri : BasicNVPTXInst<(outs B32:$dest, B1:$pred), + (ins t.RC:$value, i32imm:$mask), + "match.all.sync." # t.PtxType, + [(set i32:$dest, i1:$pred, (op imm:$mask, t.Ty:$value))]>; + def rr : BasicNVPTXInst<(outs B32:$dest, B1:$pred), + (ins t.RC:$value, B32:$mask), + "match.all.sync." # t.PtxType, + [(set i32:$dest, i1:$pred, (op i32:$mask, t.Ty:$value))]>; + } + defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<I32RT, int_nvvm_match_all_sync_i32p>; + defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<I64RT, int_nvvm_match_all_sync_i64p>; } // activemask.b32 @@ -308,39 +327,6 @@ def ACTIVEMASK : BasicNVPTXInst<(outs B32:$dest), (ins), [(set i32:$dest, (int_nvvm_activemask))]>, Requires<[hasPTX<62>, hasSM<30>]>; -defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<B32, "b32", int_nvvm_match_any_sync_i32, - i32imm>; -defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<B64, "b64", int_nvvm_match_any_sync_i64, - i64imm>; - -multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, - Operand ImmOp> { - def ii : BasicNVPTXInst<(outs B32:$dest, B1:$pred), - (ins ImmOp:$value, i32imm:$mask), - "match.all.sync." # ptxtype, - [(set i32:$dest, i1:$pred, (IntOp imm:$mask, imm:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def ir : BasicNVPTXInst<(outs B32:$dest, B1:$pred), - (ins ImmOp:$value, B32:$mask), - "match.all.sync." # ptxtype, - [(set i32:$dest, i1:$pred, (IntOp i32:$mask, imm:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def ri : BasicNVPTXInst<(outs B32:$dest, B1:$pred), - (ins regclass:$value, i32imm:$mask), - "match.all.sync." # ptxtype, - [(set i32:$dest, i1:$pred, (IntOp imm:$mask, regclass:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; - def rr : BasicNVPTXInst<(outs B32:$dest, B1:$pred), - (ins regclass:$value, B32:$mask), - "match.all.sync." # ptxtype, - [(set i32:$dest, i1:$pred, (IntOp i32:$mask, regclass:$value))]>, - Requires<[hasPTX<60>, hasSM<70>]>; -} -defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<B32, "b32", int_nvvm_match_all_sync_i32p, - i32imm>; -defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<B64, "b64", int_nvvm_match_all_sync_i64p, - i64imm>; - multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> { def : BasicNVPTXInst<(outs B32:$dst), (ins B32:$src, B32:$mask), "redux.sync." # BinOp # "." # PTXType, @@ -381,24 +367,20 @@ defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", ".abs", ".NaN">; //----------------------------------- // Explicit Memory Fence Functions //----------------------------------- -class MEMBAR<string StrOp, Intrinsic IntOP> : - BasicNVPTXInst<(outs), (ins), - StrOp, [(IntOP)]>; +class NullaryInst<string StrOp, Intrinsic IntOP> : + BasicNVPTXInst<(outs), (ins), StrOp, [(IntOP)]>; -def INT_MEMBAR_CTA : MEMBAR<"membar.cta", int_nvvm_membar_cta>; -def INT_MEMBAR_GL : MEMBAR<"membar.gl", int_nvvm_membar_gl>; -def INT_MEMBAR_SYS : MEMBAR<"membar.sys", int_nvvm_membar_sys>; +def INT_MEMBAR_CTA : NullaryInst<"membar.cta", int_nvvm_membar_cta>; +def INT_MEMBAR_GL : NullaryInst<"membar.gl", int_nvvm_membar_gl>; +def INT_MEMBAR_SYS : NullaryInst<"membar.sys", int_nvvm_membar_sys>; def INT_FENCE_SC_CLUSTER: - MEMBAR<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, + NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; // Proxy fence (uni-directional) -// fence.proxy.tensormap.release variants - class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> : - BasicNVPTXInst<(outs), (ins), - "fence.proxy.tensormap::generic.release." # Scope, [(Intr)]>, + NullaryInst<"fence.proxy.tensormap::generic.release." # Scope, Intr>, Requires<[hasPTX<83>, hasSM<90>]>; def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA: @@ -488,35 +470,31 @@ defm CP_ASYNC_CG_SHARED_GLOBAL_16 : CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16, int_nvvm_cp_async_cg_shared_global_16_s>; -def CP_ASYNC_COMMIT_GROUP : - BasicNVPTXInst<(outs), (ins), "cp.async.commit_group", [(int_nvvm_cp_async_commit_group)]>, - Requires<[hasPTX<70>, hasSM<80>]>; +let Predicates = [hasPTX<70>, hasSM<80>] in { + def CP_ASYNC_COMMIT_GROUP : + NullaryInst<"cp.async.commit_group", int_nvvm_cp_async_commit_group>; -def CP_ASYNC_WAIT_GROUP : - BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group", - [(int_nvvm_cp_async_wait_group timm:$n)]>, - Requires<[hasPTX<70>, hasSM<80>]>; + def CP_ASYNC_WAIT_GROUP : + BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group", + [(int_nvvm_cp_async_wait_group timm:$n)]>; -def CP_ASYNC_WAIT_ALL : - BasicNVPTXInst<(outs), (ins), "cp.async.wait_all", - [(int_nvvm_cp_async_wait_all)]>, - Requires<[hasPTX<70>, hasSM<80>]>; + def CP_ASYNC_WAIT_ALL : + NullaryInst<"cp.async.wait_all", int_nvvm_cp_async_wait_all>; +} -// cp.async.bulk variants of the commit/wait group -def CP_ASYNC_BULK_COMMIT_GROUP : - BasicNVPTXInst<(outs), (ins), "cp.async.bulk.commit_group", - [(int_nvvm_cp_async_bulk_commit_group)]>, - Requires<[hasPTX<80>, hasSM<90>]>; +let Predicates = [hasPTX<80>, hasSM<90>] in { + // cp.async.bulk variants of the commit/wait group + def CP_ASYNC_BULK_COMMIT_GROUP : + NullaryInst<"cp.async.bulk.commit_group", int_nvvm_cp_async_bulk_commit_group>; -def CP_ASYNC_BULK_WAIT_GROUP : - BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group", - [(int_nvvm_cp_async_bulk_wait_group timm:$n)]>, - Requires<[hasPTX<80>, hasSM<90>]>; + def CP_ASYNC_BULK_WAIT_GROUP : + BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group", + [(int_nvvm_cp_async_bulk_wait_group timm:$n)]>; -def CP_ASYNC_BULK_WAIT_GROUP_READ : - BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read", - [(int_nvvm_cp_async_bulk_wait_group_read timm:$n)]>, - Requires<[hasPTX<80>, hasSM<90>]>; + def CP_ASYNC_BULK_WAIT_GROUP_READ : + BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read", + [(int_nvvm_cp_async_bulk_wait_group_read timm:$n)]>; +} //------------------------------ // TMA Async Bulk Copy Functions @@ -974,33 +952,30 @@ defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", //Prefetch and Prefetchu -class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>, - Requires<[hasPTX<80>, hasSM<90>]>; - +let Predicates = [hasPTX<80>, hasSM<90>] in { + class PREFETCH_INTRS<string InstName> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(!cast<Intrinsic>(!strconcat("int_nvvm_", + !subst(".", "_", InstName))) addr:$addr)]>; -def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; -def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; -def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; -def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; -def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; -def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; + def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; + def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; + def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; + def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; + def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; + def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; -def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>, - Requires<[hasPTX<80>, hasSM<90>]>; + def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_normal", + [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; -def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>, - Requires<[hasPTX<80>, hasSM<90>]>; + def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_last", + [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; - -def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; + def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +} //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : @@ -1031,99 +1006,82 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">; // MBarrier Functions //----------------------------------- -multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$count), - "mbarrier.init" # AddrSpace # ".b64", - [(Intrin addr:$addr, i32:$count)]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>; -defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared", - int_nvvm_mbarrier_init_shared>; - -multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "mbarrier.inval" # AddrSpace # ".b64", - [(Intrin addr:$addr)]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>; -defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared", - int_nvvm_mbarrier_inval_shared>; - -multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs B64:$state), (ins ADDR:$addr), - "mbarrier.arrive" # AddrSpace # ".b64", - [(set i64:$state, (Intrin addr:$addr))]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>; -defm MBARRIER_ARRIVE_SHARED : - MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>; - -multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs B64:$state), - (ins ADDR:$addr, B32:$count), - "mbarrier.arrive.noComplete" # AddrSpace # ".b64", - [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_ARRIVE_NOCOMPLETE : - MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>; -defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED : - MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>; - -multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs B64:$state), (ins ADDR:$addr), - "mbarrier.arrive_drop" # AddrSpace # ".b64", - [(set i64:$state, (Intrin addr:$addr))]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_ARRIVE_DROP : - MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>; -defm MBARRIER_ARRIVE_DROP_SHARED : - MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>; - -multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs B64:$state), - (ins ADDR:$addr, B32:$count), - "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64", - [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, - Requires<[hasPTX<70>, hasSM<80>]>; -} - -defm MBARRIER_ARRIVE_DROP_NOCOMPLETE : - MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>; -defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED : - MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared", - int_nvvm_mbarrier_arrive_drop_noComplete_shared>; - -multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> { - def "" : BasicNVPTXInst<(outs B1:$res), (ins ADDR:$addr, B64:$state), - "mbarrier.test_wait" # AddrSpace # ".b64", - [(set i1:$res, (Intrin addr:$addr, i64:$state))]>, - Requires<[hasPTX<70>, hasSM<80>]>; +let Predicates = [hasPTX<70>, hasSM<80>] in { + class MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$count), + "mbarrier.init" # AddrSpace # ".b64", + [(Intrin addr:$addr, i32:$count)]>; + + def MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>; + def MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared", + int_nvvm_mbarrier_init_shared>; + + class MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + "mbarrier.inval" # AddrSpace # ".b64", + [(Intrin addr:$addr)]>; + + def MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>; + def MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared", + int_nvvm_mbarrier_inval_shared>; + + class MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs B64:$state), (ins ADDR:$addr), + "mbarrier.arrive" # AddrSpace # ".b64", + [(set i64:$state, (Intrin addr:$addr))]>; + + def MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>; + def MBARRIER_ARRIVE_SHARED : + MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>; + + class MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs B64:$state), + (ins ADDR:$addr, B32:$count), + "mbarrier.arrive.noComplete" # AddrSpace # ".b64", + [(set i64:$state, (Intrin addr:$addr, i32:$count))]>; + + def MBARRIER_ARRIVE_NOCOMPLETE : + MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>; + def MBARRIER_ARRIVE_NOCOMPLETE_SHARED : + MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>; + + class MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs B64:$state), (ins ADDR:$addr), + "mbarrier.arrive_drop" # AddrSpace # ".b64", + [(set i64:$state, (Intrin addr:$addr))]>; + + def MBARRIER_ARRIVE_DROP : + MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>; + def MBARRIER_ARRIVE_DROP_SHARED : + MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>; + + class MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs B64:$state), + (ins ADDR:$addr, B32:$count), + "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64", + [(set i64:$state, (Intrin addr:$addr, i32:$count))]>; + + def MBARRIER_ARRIVE_DROP_NOCOMPLETE : + MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>; + def MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED : + MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared", + int_nvvm_mbarrier_arrive_drop_noComplete_shared>; + + class MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> : + BasicNVPTXInst<(outs B1:$res), (ins ADDR:$addr, B64:$state), + "mbarrier.test_wait" # AddrSpace # ".b64", + [(set i1:$res, (Intrin addr:$addr, i64:$state))]>; + + def MBARRIER_TEST_WAIT : + MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>; + def MBARRIER_TEST_WAIT_SHARED : + MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>; + + def MBARRIER_PENDING_COUNT : + BasicNVPTXInst<(outs B32:$res), (ins B64:$state), + "mbarrier.pending_count.b64", + [(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>; } - -defm MBARRIER_TEST_WAIT : - MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>; -defm MBARRIER_TEST_WAIT_SHARED : - MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>; - -class MBARRIER_PENDING_COUNT<Intrinsic Intrin> : - BasicNVPTXInst<(outs B32:$res), (ins B64:$state), - "mbarrier.pending_count.b64", - [(set i32:$res, (Intrin i64:$state))]>, - Requires<[hasPTX<70>, hasSM<80>]>; - -def MBARRIER_PENDING_COUNT : - MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>; - //----------------------------------- // Math Functions //----------------------------------- @@ -1449,15 +1407,11 @@ defm ABS_F64 : F_ABS<"f64", F64RT, support_ftz = false>; def fcopysign_nvptx : SDNode<"NVPTXISD::FCOPYSIGN", SDTFPBinOp>; -def COPYSIGN_F : - BasicNVPTXInst<(outs B32:$dst), (ins B32:$src0, B32:$src1), - "copysign.f32", - [(set f32:$dst, (fcopysign_nvptx f32:$src1, f32:$src0))]>; - -def COPYSIGN_D : - BasicNVPTXInst<(outs B64:$dst), (ins B64:$src0, B64:$src1), - "copysign.f64", - [(set f64:$dst, (fcopysign_nvptx f64:$src1, f64:$src0))]>; +foreach t = [F32RT, F64RT] in + def COPYSIGN_ # t : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src0, t.RC:$src1), + "copysign." # t.PtxType, + [(set t.Ty:$dst, (fcopysign_nvptx t.Ty:$src1, t.Ty:$src0))]>; // // Neg bf16, bf16x2 @@ -2255,38 +2209,35 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; // Scalar -class LDU_G<string TyStr, NVPTXRegClass regclass> - : NVPTXInst<(outs regclass:$result), (ins ADDR:$src), - "ldu.global." # TyStr # " \t$result, [$src];", []>; +class LDU_G<NVPTXRegClass regclass> + : NVPTXInst<(outs regclass:$result), (ins i32imm:$fromWidth, ADDR:$src), + "ldu.global.b$fromWidth \t$result, [$src];", []>; -def LDU_GLOBAL_i8 : LDU_G<"b8", B16>; -def LDU_GLOBAL_i16 : LDU_G<"b16", B16>; -def LDU_GLOBAL_i32 : LDU_G<"b32", B32>; -def LDU_GLOBAL_i64 : LDU_G<"b64", B64>; +def LDU_GLOBAL_i16 : LDU_G<B16>; +def LDU_GLOBAL_i32 : LDU_G<B32>; +def LDU_GLOBAL_i64 : LDU_G<B64>; // vector // Elementized vector ldu -class VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> +class VLDU_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins ADDR:$src), - "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>; + (ins i32imm:$fromWidth, ADDR:$src), + "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; -class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> - : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, - regclass:$dst4), (ins ADDR:$src), - "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; +class VLDU_G_ELE_V4<NVPTXRegClass regclass> + : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins i32imm:$fromWidth, ADDR:$src), + "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; -def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", B16>; -def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", B16>; -def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", B32>; -def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", B64>; +def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<B16>; +def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<B32>; +def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<B64>; -def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", B16>; -def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", B16>; -def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>; +def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<B16>; +def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<B32>; //----------------------------------- @@ -2327,12 +2278,10 @@ class VLDG_G_ELE_V8<NVPTXRegClass regclass> : "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. -def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<B16>; def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<B16>; def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<B32>; def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<B64>; -def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<B16>; def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<B16>; def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<B32>; @@ -2342,19 +2291,19 @@ def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<B32>; multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> { if Supports32 then def "" : BasicNVPTXInst<(outs B32:$result), (ins B32:$src), - "cvta." # Str # ".u32", []>, Requires<Preds>; + "cvta." # Str # ".u32">, Requires<Preds>; def _64 : BasicNVPTXInst<(outs B64:$result), (ins B64:$src), - "cvta." # Str # ".u64", []>, Requires<Preds>; + "cvta." # Str # ".u64">, Requires<Preds>; } multiclass G_TO_NG<string Str, bit Supports32 = 1, list<Predicate> Preds = []> { if Supports32 then def "" : BasicNVPTXInst<(outs B32:$result), (ins B32:$src), - "cvta.to." # Str # ".u32", []>, Requires<Preds>; + "cvta.to." # Str # ".u32">, Requires<Preds>; def _64 : BasicNVPTXInst<(outs B64:$result), (ins B64:$src), - "cvta.to." # Str # ".u64", []>, Requires<Preds>; + "cvta.to." # Str # ".u64">, Requires<Preds>; } foreach space = ["local", "shared", "global", "const", "param"] in { @@ -4614,9 +4563,9 @@ def INT_PTX_SREG_LANEMASK_GT : PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>; let hasSideEffects = 1 in { -def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>; -def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>; -def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>; + def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>; + def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>; + def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>; } def: Pat <(i64 (readcyclecounter)), (SREG_CLOCK64)>; @@ -5096,37 +5045,36 @@ foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs, STMATRIXs) in def : MMA_PAT<mma>; multiclass MAPA<string suffix, Intrinsic Intr> { - def _32: BasicNVPTXInst<(outs B32:$d), (ins B32:$a, B32:$b), - "mapa" # suffix # ".u32", - [(set i32:$d, (Intr i32:$a, i32:$b))]>, - Requires<[hasSM<90>, hasPTX<78>]>; - def _32i: BasicNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b), - "mapa" # suffix # ".u32", - [(set i32:$d, (Intr i32:$a, imm:$b))]>, - Requires<[hasSM<90>, hasPTX<78>]>; - def _64: BasicNVPTXInst<(outs B64:$d), (ins B64:$a, B32:$b), - "mapa" # suffix # ".u64", - [(set i64:$d, (Intr i64:$a, i32:$b))]>, - Requires<[hasSM<90>, hasPTX<78>]>; - def _64i: BasicNVPTXInst<(outs B64:$d), (ins B64:$a, i32imm:$b), - "mapa" # suffix # ".u64", - [(set i64:$d, (Intr i64:$a, imm:$b))]>, - Requires<[hasSM<90>, hasPTX<78>]>; + let Predicates = [hasSM<90>, hasPTX<78>] in { + def _32: BasicNVPTXInst<(outs B32:$d), (ins B32:$a, B32:$b), + "mapa" # suffix # ".u32", + [(set i32:$d, (Intr i32:$a, i32:$b))]>; + def _32i: BasicNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b), + "mapa" # suffix # ".u32", + [(set i32:$d, (Intr i32:$a, imm:$b))]>; + def _64: BasicNVPTXInst<(outs B64:$d), (ins B64:$a, B32:$b), + "mapa" # suffix # ".u64", + [(set i64:$d, (Intr i64:$a, i32:$b))]>; + def _64i: BasicNVPTXInst<(outs B64:$d), (ins B64:$a, i32imm:$b), + "mapa" # suffix # ".u64", + [(set i64:$d, (Intr i64:$a, imm:$b))]>; + } } + defm mapa : MAPA<"", int_nvvm_mapa>; defm mapa_shared_cluster : MAPA<".shared::cluster", int_nvvm_mapa_shared_cluster>; multiclass GETCTARANK<string suffix, Intrinsic Intr> { - def _32: BasicNVPTXInst<(outs B32:$d), (ins B32:$a), - "getctarank" # suffix # ".u32", - [(set i32:$d, (Intr i32:$a))]>, - Requires<[hasSM<90>, hasPTX<78>]>; - def _64: BasicNVPTXInst<(outs B32:$d), (ins B64:$a), - "getctarank" # suffix # ".u64", - [(set i32:$d, (Intr i64:$a))]>, - Requires<[hasSM<90>, hasPTX<78>]>; + let Predicates = [hasSM<90>, hasPTX<78>] in { + def _32: BasicNVPTXInst<(outs B32:$d), (ins B32:$a), + "getctarank" # suffix # ".u32", + [(set i32:$d, (Intr i32:$a))]>; + def _64: BasicNVPTXInst<(outs B32:$d), (ins B64:$a), + "getctarank" # suffix # ".u64", + [(set i32:$d, (Intr i64:$a))]>; + } } defm getctarank : GETCTARANK<"", int_nvvm_getctarank>; @@ -5165,29 +5113,25 @@ def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm: [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>; } // isConvergent = true -def GRIDDEPCONTROL_LAUNCH_DEPENDENTS : - BasicNVPTXInst<(outs), (ins), - "griddepcontrol.launch_dependents", - [(int_nvvm_griddepcontrol_launch_dependents)]>, - Requires<[hasSM<90>, hasPTX<78>]>; - -def GRIDDEPCONTROL_WAIT : - BasicNVPTXInst<(outs), (ins), - "griddepcontrol.wait", - [(int_nvvm_griddepcontrol_wait)]>, - Requires<[hasSM<90>, hasPTX<78>]>; +let Predicates = [hasSM<90>, hasPTX<78>] in { + def GRIDDEPCONTROL_LAUNCH_DEPENDENTS : + BasicNVPTXInst<(outs), (ins), "griddepcontrol.launch_dependents", + [(int_nvvm_griddepcontrol_launch_dependents)]>; + def GRIDDEPCONTROL_WAIT : + BasicNVPTXInst<(outs), (ins), "griddepcontrol.wait", + [(int_nvvm_griddepcontrol_wait)]>; +} def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; // Tcgen05 intrinsics -let isConvergent = true in { +let isConvergent = true, Predicates = [hasTcgen05Instructions] in { multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$dst, B32:$ncols), "tcgen05.alloc.cta_group::" # num # ".sync.aligned" # AS # ".b32", - [(Intr addr:$dst, B32:$ncols)]>, - Requires<[hasTcgen05Instructions]>; + [(Intr addr:$dst, B32:$ncols)]>; } defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<"", "1", int_nvvm_tcgen05_alloc_cg1>; @@ -5200,8 +5144,7 @@ multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins B32:$tmem_addr, B32:$ncols), "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32", - [(Intr B32:$tmem_addr, B32:$ncols)]>, - Requires<[hasTcgen05Instructions]>; + [(Intr B32:$tmem_addr, B32:$ncols)]>; } defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>; defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; @@ -5209,19 +5152,13 @@ defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2 multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins), "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", - [(Intr)]>, - Requires<[hasTcgen05Instructions]>; + [(Intr)]>; } defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; -def tcgen05_wait_ld: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::ld.sync.aligned", - [(int_nvvm_tcgen05_wait_ld)]>, - Requires<[hasTcgen05Instructions]>; - -def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligned", - [(int_nvvm_tcgen05_wait_st)]>, - Requires<[hasTcgen05Instructions]>; +def tcgen05_wait_ld: NullaryInst<"tcgen05.wait::ld.sync.aligned", int_nvvm_tcgen05_wait_ld>; +def tcgen05_wait_st: NullaryInst<"tcgen05.wait::st.sync.aligned", int_nvvm_tcgen05_wait_st>; multiclass TCGEN05_COMMIT_INTR<string AS, string num> { defvar prefix = "tcgen05.commit.cta_group::" # num #".mbarrier::arrive::one.shared::cluster"; @@ -5232,12 +5169,10 @@ multiclass TCGEN05_COMMIT_INTR<string AS, string num> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$mbar), prefix # ".b64", - [(Intr addr:$mbar)]>, - Requires<[hasTcgen05Instructions]>; + [(Intr addr:$mbar)]>; def _MC : BasicNVPTXInst<(outs), (ins ADDR:$mbar, B16:$mc), prefix # ".multicast::cluster.b64", - [(IntrMC addr:$mbar, B16:$mc)]>, - Requires<[hasTcgen05Instructions]>; + [(IntrMC addr:$mbar, B16:$mc)]>; } defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<"", "1">; @@ -5249,8 +5184,7 @@ multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$tmem_addr), "tcgen05.shift.cta_group::" # num # ".down", - [(Intr addr:$tmem_addr)]>, - Requires<[hasTcgen05Instructions]>; + [(Intr addr:$tmem_addr)]>; } defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; @@ -5270,13 +5204,11 @@ multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> { def _cg1 : BasicNVPTXInst<(outs), (ins ADDR:$tmem_addr, B64:$sdesc), "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm, - [(IntrCG1 addr:$tmem_addr, B64:$sdesc)]>, - Requires<[hasTcgen05Instructions]>; + [(IntrCG1 addr:$tmem_addr, B64:$sdesc)]>; def _cg2 : BasicNVPTXInst<(outs), (ins ADDR:$tmem_addr, B64:$sdesc), "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm, - [(IntrCG2 addr:$tmem_addr, B64:$sdesc)]>, - Requires<[hasTcgen05Instructions]>; + [(IntrCG2 addr:$tmem_addr, B64:$sdesc)]>; } foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { @@ -5289,17 +5221,13 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { } } // isConvergent -let hasSideEffects = 1 in { +let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in { -def tcgen05_fence_before_thread_sync: BasicNVPTXInst<(outs), (ins), - "tcgen05.fence::before_thread_sync", - [(int_nvvm_tcgen05_fence_before_thread_sync)]>, - Requires<[hasTcgen05Instructions]>; + def tcgen05_fence_before_thread_sync: NullaryInst< + "tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>; -def tcgen05_fence_after_thread_sync: BasicNVPTXInst<(outs), (ins), - "tcgen05.fence::after_thread_sync", - [(int_nvvm_tcgen05_fence_after_thread_sync)]>, - Requires<[hasTcgen05Instructions]>; + def tcgen05_fence_after_thread_sync: NullaryInst< + "tcgen05.fence::after_thread_sync", int_nvvm_tcgen05_fence_after_thread_sync>; } // hasSideEffects @@ -5392,17 +5320,17 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in { // Bulk store instructions def st_bulk_imm : TImmLeaf<i64, [{ return Imm == 0; }]>; -def INT_NVVM_ST_BULK_GENERIC : - BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, B64:$size, i64imm:$value), - "st.bulk", - [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, - Requires<[hasSM<100>, hasPTX<86>]>; +let Predicates = [hasSM<100>, hasPTX<86>] in { + def INT_NVVM_ST_BULK_GENERIC : + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, B64:$size, i64imm:$value), + "st.bulk", + [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>; -def INT_NVVM_ST_BULK_SHARED_CTA: - BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, B64:$size, i64imm:$value), - "st.bulk.shared::cta", - [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, - Requires<[hasSM<100>, hasPTX<86>]>; + def INT_NVVM_ST_BULK_SHARED_CTA: + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, B64:$size, i64imm:$value), + "st.bulk.shared::cta", + [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>; +} // // clusterlaunchcontorl Instructions diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index d40886a..2e81ab1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -38,14 +38,6 @@ foreach i = 0...4 in { def R#i : NVPTXReg<"%r"#i>; // 32-bit def RL#i : NVPTXReg<"%rd"#i>; // 64-bit def RQ#i : NVPTXReg<"%rq"#i>; // 128-bit - def H#i : NVPTXReg<"%h"#i>; // 16-bit float - def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float - - // Arguments - def ia#i : NVPTXReg<"%ia"#i>; - def la#i : NVPTXReg<"%la"#i>; - def fa#i : NVPTXReg<"%fa"#i>; - def da#i : NVPTXReg<"%da"#i>; } foreach i = 0...31 in { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 5779d4e..0e8828f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -243,8 +243,6 @@ public: createObjectTargetWriter() const override { return createPPCXCOFFObjectWriter(TT.isArch64Bit()); } - - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; }; } // end anonymous namespace @@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const { return std::nullopt; } -std::optional<MCFixupKind> -XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const { - return StringSwitch<std::optional<MCFixupKind>>(Name) - .Case("R_REF", PPC::fixup_ppc_nofixup) - .Default(std::nullopt); -} - MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 9e8ee9f..df0c666 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -48,8 +48,7 @@ enum Fixups { /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer - /// register number. It can also be used to tie the ref symbol to prevent it - /// from being garbage collected on AIX. + /// register number. fixup_ppc_nofixup, /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index f75ab62..a04f404 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); + case XCOFF::RelocationType::R_REF: + return {XCOFF::RelocationType::R_REF, 0}; case PPC::fixup_ppc_half16: { const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Specifier) { @@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25}; case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; - case PPC::fixup_ppc_nofixup: { - if (Specifier == PPC::S_None) - return {XCOFF::RelocationType::R_REF, 0}; - else - llvm_unreachable("Unsupported Modifier"); - } break; case FK_Data_4: case FK_Data_8: const uint8_t SignAndSizeForFKData = diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index d295f35..1dc485d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class XXEvalPattern <dag pattern, bits<8> imm> : - Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} +// ============================================================================= +// XXEVAL Instruction Pattern Definitions +// ============================================================================= +// +// XXEVAL instruction performs 256 different logical operations on three vector +// operands using an 8-bit immediate value to select the operation. +// Format: xxeval XT, XA, XB, XC, IMM +// For example: +// Equivalent function A?xor(B,C):and(B,C) is performed by +// xxeval XT, XA, XB, XC, 22 +// +// REGISTER CLASS CONSTRAINTS: +// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64] +// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC +// ============================================================================= + +class XXEvalPattern<dag pattern, bits<8> imm> + : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} + +class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm> + : Pat<(Vt InputPattern), + !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)), + // VSRC path: direct XXEVAL for v4i32 and v2i64 + (XXEVAL $vA, $vB, $vC, Imm), + // VRRC path: wrap with COPY_TO_REGCLASS for other types + (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC), + (COPY_TO_REGCLASS Vt:$vB, VSRC), + (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm), + VRRC))> {} + +// ============================================================================= +// PatFrags for Bitcast-Aware Vector bitwise Operations +// +// Each PatFrags defines TWO alternatives for pattern matcher to choose: +// - Direct operation (for v4i32) +// - Bitcast operation (for other types: v2i64, v16i8, v8i16) +// ============================================================================= + +// Basic Binary Operations +def VAnd + : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b), + (bitconvert(and + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VXor + : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b), + (bitconvert(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b), + (bitconvert(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VNot + : PatFrags<(ops node:$a), [(vnot node:$a), + (bitconvert(vnot(v4i32(bitconvert node:$a))))]>; + +// Derived bitwise operations +// Vector NOR operation (not(or)) +def VNor + : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)), + (bitconvert(vnot(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// Vector EQV operation (not(xor)) +def VEqv + : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)), + (bitconvert(vnot(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd +// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C) +// and emit the corresponding xxeval instruction with the imm value. +// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT) +// - AND(B,C) is the "false" case op on vectors B and C +// ============================================================================= +multiclass XXEvalTernarySelectAnd<ValueType Vt> { + // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 22>; + + // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 24>; + + // Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 25>; + + // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>; + + // Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; +} let Predicates = [PrefixInstrs, HasP10Vector] in { let AddedComplexity = 400 in { @@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // (xor A, (or B, C)) def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + // XXEval Patterns for ternary Operations. + foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { + defm : XXEvalTernarySelectAnd<Ty>; + } + // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 def : Pat<(f128 (load PDForm:$src)), diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp index 5eb1f01..b7e2263 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -100,10 +100,14 @@ bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -189,10 +193,14 @@ bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; // Keep clustered nodes together. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 5e54b82..67cc01e 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, + const MCDisassembler *Decoder) { + if (Imm < RISCVZC::RA_S0) + return MCDisassembler::Fail; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); +} + +static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, + uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, return S; } -static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Imm < RISCVZC::RA_S0) - return MCDisassembler::Fail; - return decodeZcmpRlist(Inst, Imm, Address, Decoder); -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 82e3b5c..9538b20 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -901,7 +901,7 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index 4c303a9..da6b95d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt, // Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31. def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt, (sequence "X%u", 16, 31))>; + +def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>; +def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs, + (sequence "X%u", 16, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 34910b7..5998653 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 < C2 // -> (SignedBitfieldExtract X, msb, lsb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 > C2 // -> (NDS.BFOS X, lsb, msb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) // where C2 has 32 leading zeros and C3 trailing zeros. SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // - without Zba a tablegen pattern applies the very same // transform as we would have done here SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, VT, N0->getOperand(0), + RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LeadingZeros, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned TrailingZeros = llvm::countr_zero(Mask); if (LeadingZeros == 32 && TrailingZeros > ShAmt) { SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (TrailingOnes == 32) { SDNode *SRLI = CurDAG->getMachineNode( Subtarget->is64Bit() ? RISCV::SRLIW : RISCV::SRLI, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (HasBitTest && ShAmt + 1 == TrailingOnes) { SDNode *BEXTI = CurDAG->getMachineNode( Subtarget->hasStdExtZbs() ? RISCV::BEXTI : RISCV::TH_TST, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, BEXTI); return; } const unsigned Msb = TrailingOnes - 1; const unsigned Lsb = ShAmt; - if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb)) + if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb)) return; unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; unsigned LShAmt = Subtarget->getXLen() - ExtSize; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRAI = CurDAG->getMachineNode( RISCV::SRAI, DL, VT, SDValue(SLLI, 0), @@ -2827,6 +2827,8 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, static bool isWorthFoldingAdd(SDValue Add) { for (auto *User : Add->users()) { if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE && + User->getOpcode() != RISCVISD::LD_RV32 && + User->getOpcode() != RISCVISD::SD_RV32 && User->getOpcode() != ISD::ATOMIC_LOAD && User->getOpcode() != ISD::ATOMIC_STORE) return false; @@ -2841,6 +2843,9 @@ static bool isWorthFoldingAdd(SDValue Add) { if (User->getOpcode() == ISD::ATOMIC_STORE && cast<AtomicSDNode>(User)->getVal() == Add) return false; + if (User->getOpcode() == RISCVISD::SD_RV32 && + (User->getOperand(0) == Add || User->getOperand(1) == Add)) + return false; if (isStrongerThanMonotonic(cast<MemSDNode>(User)->getSuccessOrdering())) return false; } @@ -2942,8 +2947,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2953,8 +2958,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 607edd3..c0ada51 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,27 +2739,6 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } -bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( - EVT ScalarTy) const { - if (!ScalarTy.isSimple()) - return false; - switch (ScalarTy.getSimpleVT().SimpleTy) { - case MVT::iPTR: - return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true; - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::f16: - case MVT::bf16: - case MVT::f32: - return true; - case MVT::i64: - case MVT::f64: - return Subtarget.hasVInstructionsI64(); - default: - return false; - } -} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -20751,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getAllOnesConstant(DL, VT); return DAG.getConstant(0, DL, VT); } + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: { + SDValue Tuple = N->getOperand(2); + unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + + if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() || + Tuple.getOpcode() != RISCVISD::TUPLE_INSERT || + !Tuple.getOperand(0).isUndef()) + return SDValue(); + + SDValue Val = Tuple.getOperand(1); + unsigned Idx = Tuple.getConstantOperandVal(2); + + unsigned SEW = Val.getValueType().getScalarSizeInBits(); + assert(Log2_64(SEW) == N->getConstantOperandVal(6) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/N->getOperand(0), + /*IntID=*/ + DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT), + /*StoredVal=*/Val, + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/N->getOperand(4), + /*VL=*/N->getOperand(5)}; + + auto *OldMemSD = cast<MemIntrinsicSDNode>(N); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = OldMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList(MVT::Other); + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT, + MMO); + } } } case ISD::EXPERIMENTAL_VP_REVERSE: @@ -20843,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case RISCVISD::TUPLE_EXTRACT: { + EVT VT = N->getValueType(0); + SDValue Tuple = N->getOperand(0); + unsigned Idx = N->getConstantOperandVal(1); + if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN) + break; + + unsigned NF = 0; + switch (Tuple.getConstantOperandVal(1)) { + default: + break; + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + break; + } + + if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF)) + break; + + unsigned SEW = VT.getScalarSizeInBits(); + assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/Tuple.getOperand(0), + /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT), + /*Passthru=*/Tuple.getOperand(2), + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/Tuple.getOperand(4), + /*VL=*/Tuple.getOperand(5), + /*Policy=*/Tuple.getOperand(6)}; + + auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = TupleMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList({VT, MVT::Other}); + SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, + Ops, MemVT, MMO); + DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1)); + return Result.getValue(0); + } + case RISCVISD::TUPLE_INSERT: { + // tuple_insert tuple, undef, idx -> tuple + if (N->getOperand(1).isUndef()) + return N->getOperand(0); + break; + } } return SDValue(); @@ -22367,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::C: case CallingConv::Fast: case CallingConv::SPIR_KERNEL: + case CallingConv::PreserveMost: case CallingConv::GRAAL: case CallingConv::RISCV_VectorCall: #define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN: @@ -22636,8 +22725,14 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; + + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -22895,6 +22990,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.CFIType) Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); + if (MF.getTarget().Options.EmitCallGraphSection && CB && + CB->isIndirectCall()) + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } @@ -22902,6 +23000,10 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); if (CLI.CFIType) Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); + + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); @@ -24260,7 +24362,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) + if (!isLegalElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a788c0b7..ca70c46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,7 +384,6 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; - bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index dd365cf..8297d50 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { +let IsSignExtendingOpW = 1 in def CLS : Unary_r<0b011000000011, 0b001, "cls">; def ABS : Unary_r<0b011000000111, 0b001, "abs">; } // Predicates = [HasStdExtP] @@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in { def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; +let IsSignExtendingOpW = 1 in { def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; +} } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 6afc942d..03e6f43 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1510,21 +1510,6 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass, let VLMul = MInfo.value; } -class VPseudoTernaryNoMask<VReg RetClass, - RegisterClass Op1Class, - DAGOperand Op2Class, - string Constraint> : - RISCVVPseudo<(outs RetClass:$rd), - (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew)> { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = !interleave([Constraint, "$rd = $rs3"], ","); - let HasVLOp = 1; - let HasSEWOp = 1; -} - class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index f391300..5265613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in { def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">; def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">; - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs), - (ins uimm5nonzero:$imm), - "qc.c.delay", "$imm"> { - let Inst{12} = 0; - let Inst{11-7} = 0; - let Inst{6-2} = imm{4-0}; - } + // qc.c.delay implemented as an alias, below } // Predicates = [HasVendorXqcisync, IsRV32] let Predicates = [HasVendorXqcisim, IsRV32] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10), - "qc.psyscalli", "$imm10"> { - bits<10> imm10; - - let rs1 = 0; - let rd = 0; - let imm12 = {0b00, imm10}; - } - def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8), "qc.pputci", "$imm8"> { bits<8> imm8; @@ -1150,18 +1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { let imm12 = {0b0100, imm8}; } - def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">; - def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">; - def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">; - def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">; - def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">; - def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">; - def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">; - - def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> { - let rd = 0; - let imm = 0; - } + // The other instructions are all implemented as aliases, below } // mayLoad = 0, mayStore = 0, hasSideEffects = 1 } // Predicates = [HasVendorXqcisim, IsRV32] @@ -1218,6 +1191,27 @@ let EmitPriority = 0 in { } // EmitPriority = 0 } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasVendorXqcisim, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>; + + def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>; + def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>; + def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>; + def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>; + def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>; + def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>; + def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>; + def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>; +} // EmitPriority = 1 +} // Predicates = [HasVendorXqcisim, IsRV32] + +let Predicates = [HasVendorXqcisync, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>; +} +} // Predicates = [HasVendorXqcisync, IsRV32] + //===----------------------------------------------------------------------===// // Pseudo-instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 3cbe668..726920e 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || + if (!isLegalElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; @@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - // If the segment load is going to be performed segment at a time anyways - // and there's only one element used, use a strided load instead. This - // will be equally fast, and create less vector register pressure. - if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { - unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at most - // the number of active lanes (which is bounded by i32) this is safe. - VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {VTy, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, Mask, VL}); - Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); - CI->addParamAttr(0, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - Shuffles[0]->replaceAllUsesWith(CI); - return true; - }; - CallInst *VlsegN = Builder.CreateIntrinsic( FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); @@ -289,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - unsigned Index; - // If the segment store only has one active lane (i.e. the interleave is - // just a spread shuffle), we can use a strided store instead. This will - // be equally fast, and create less vector register pressure. - if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && - isSpreadMask(Mask, Factor, Index)) { - unsigned ScalarSizeInBytes = - DL.getTypeStoreSize(ShuffleVTy->getElementType()); - Value *Data = SVI->getOperand(0); - Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0)); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // For rv64, need to truncate i64 to i32 to match signature. As VL is at - // most the number of active lanes (which is bounded by i32) this is safe. - VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, - {VTy, BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, LaneMask, VL}); - Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes); - CI->addParamAttr(1, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - return true; - } - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 5404123..7e58b6f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return Subtarget.hasStdExtE() ? CSR_RT_MostRegs_RVE_SaveList + : CSR_RT_MostRegs_SaveList; if (MF->getFunction().hasFnAttribute("interrupt")) { if (Subtarget.hasVInstructions()) { if (Subtarget.hasStdExtD()) @@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Val = Offset.getFixed(); int64_t Lo12 = SignExtend64<12>(Val); unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::ADDI && !isInt<12>(Val)) { // We chose to emit the canonical immediate sequence rather than folding // the offset into the using add under the theory that doing so doesn't @@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); + } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + // MIPS Prefetch instructions require the offset to be 9 bits encoded. + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || Opc == RISCV::PseudoRV32ZdinxSD) && Lo12 >= 2044) { @@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; - switch (Subtarget.getTargetABI()) { + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (CC == CallingConv::PreserveMost) { + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return CSR_RT_MostRegs_RVE_RegMask; + return CSR_RT_MostRegs_RegMask; + } + switch (ABI) { default: llvm_unreachable("Unrecognized ABI"); case RISCVABI::ABI_ILP32E: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fd634b5..0d5eb86 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{ {Intrinsic::roundeven, MVT::f64, 9}, {Intrinsic::rint, MVT::f32, 7}, {Intrinsic::rint, MVT::f64, 7}, - {Intrinsic::lrint, MVT::i32, 1}, - {Intrinsic::lrint, MVT::i64, 1}, - {Intrinsic::llrint, MVT::i64, 1}, {Intrinsic::nearbyint, MVT::f32, 9}, {Intrinsic::nearbyint, MVT::f64, 9}, {Intrinsic::bswap, MVT::i16, 3}, @@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, switch (ICA.getID()) { case Intrinsic::lrint: case Intrinsic::llrint: - // We can't currently lower half or bfloat vector lrint/llrint. - if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]); - VecTy && VecTy->getElementType()->is16bitFPTy()) - return InstructionCost::getInvalid(); - [[fallthrough]]; + case Intrinsic::lround: + case Intrinsic::llround: { + auto LT = getTypeLegalizationCost(RetTy); + Type *SrcTy = ICA.getArgTypes().front(); + auto SrcLT = getTypeLegalizationCost(SrcTy); + if (ST->hasVInstructions() && LT.second.isVector()) { + SmallVector<unsigned, 2> Ops; + unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType()); + unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType()); + if (LT.second.getVectorElementType() == MVT::bf16) { + if (!ST->hasVInstructionsBF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V}; + } else if (LT.second.getVectorElementType() == MVT::f16 && + !ST->hasVInstructionsF16()) { + if (!ST->hasVInstructionsF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V}; + + } else if (SrcEltSz > DstEltSz) { + Ops = {RISCV::VFNCVT_X_F_W}; + } else if (SrcEltSz < DstEltSz) { + Ops = {RISCV::VFWCVT_X_F_V}; + } else { + Ops = {RISCV::VFCVT_X_F_V}; + } + + // We need to use the source LMUL in the case of a narrowing op, and the + // destination LMUL otherwise. + if (SrcEltSz > DstEltSz) + return SrcLT.first * + getRISCVInstructionCost(Ops, SrcLT.second, CostKind); + return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind); + } + break; + } case Intrinsic::ceil: case Intrinsic::floor: case Intrinsic::trunc: @@ -2593,18 +2627,17 @@ void RISCVTTIImpl::getUnrollingPreferences( if (L->getNumBlocks() > 4) return; - // Don't unroll vectorized loops, including the remainder loop - if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) - return; - // Scan the loop: don't unroll loops with calls as this could prevent - // inlining. + // inlining. Don't unroll auto-vectorized loops either, though do allow + // unrolling of the scalar remainder. + bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized"); InstructionCost Cost = 0; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { - // Initial setting - Don't unroll loops containing vectorized - // instructions. - if (I.getType()->isVectorTy()) + // Both auto-vectorized loops and the scalar remainder have the + // isvectorized attribute, so differentiate between them by the presence + // of vector instructions. + if (IsVectorized && I.getType()->isVectorTy()) return; if (isa<CallInst>(I) || isa<InvokeInst>(I)) { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index f0510ec..d62d99c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index c1cc19b..050de3d 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -646,8 +646,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (!Src || Src->hasUnmodeledSideEffects() || Src->getParent() != MI.getParent() || !RISCVII::isFirstDefTiedToFirstUse(Src->getDesc()) || - !RISCVII::hasVLOp(Src->getDesc().TSFlags) || - !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) + !RISCVII::hasVLOp(Src->getDesc().TSFlags)) return false; // Src's dest needs to have the same EEW as MI's input. @@ -681,12 +680,14 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { *Src->getParent()->getParent())); } - // If MI was tail agnostic and the VL didn't increase, preserve it. - int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) && - RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) - Policy |= RISCVVType::TAIL_AGNOSTIC; - Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); + if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { + // If MI was tail agnostic and the VL didn't increase, preserve it. + int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) && + RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) + Policy |= RISCVVType::TAIL_AGNOSTIC; + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); + } MRI->constrainRegClass(Src->getOperand(0).getReg(), MRI->getRegClass(MI.getOperand(0).getReg())); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 3c631ce..c4c7e85 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -194,6 +194,42 @@ class SPIRVEmitIntrinsics void useRoundingMode(ConstrainedFPIntrinsic *FPI, IRBuilder<> &B); + // Tries to walk the type accessed by the given GEP instruction. + // For each nested type access, one of the 2 callbacks is called: + // - OnLiteralIndexing when the index is a known constant value. + // Parameters: + // PointedType: the pointed type resulting of this indexing. + // If the parent type is an array, this is the index in the array. + // If the parent type is a struct, this is the field index. + // Index: index of the element in the parent type. + // - OnDynamnicIndexing when the index is a non-constant value. + // This callback is only called when indexing into an array. + // Parameters: + // ElementType: the type of the elements stored in the parent array. + // Offset: the Value* containing the byte offset into the array. + // Return true if an error occured during the walk, false otherwise. + bool walkLogicalAccessChain( + GetElementPtrInst &GEP, + const std::function<void(Type *PointedType, uint64_t Index)> + &OnLiteralIndexing, + const std::function<void(Type *ElementType, Value *Offset)> + &OnDynamicIndexing); + + // Returns the type accessed using the given GEP instruction by relying + // on the GEP type. + // FIXME: GEP types are not supposed to be used to retrieve the pointed + // type. This must be fixed. + Type *getGEPType(GetElementPtrInst *GEP); + + // Returns the type accessed using the given GEP instruction by walking + // the source type using the GEP indices. + // FIXME: without help from the frontend, this method cannot reliably retrieve + // the stored type, nor can robustly determine the depth of the type + // we are accessing. + Type *getGEPTypeLogical(GetElementPtrInst *GEP); + + Instruction *buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP); + public: static char ID; SPIRVEmitIntrinsics(SPIRVTargetMachine *TM = nullptr) @@ -246,6 +282,17 @@ bool expectIgnoredInIRTranslation(const Instruction *I) { } } +// Returns the source pointer from `I` ignoring intermediate ptrcast. +Value *getPointerRoot(Value *I) { + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::spv_ptrcast) { + Value *V = II->getArgOperand(0); + return getPointerRoot(V); + } + } + return I; +} + } // namespace char SPIRVEmitIntrinsics::ID = 0; @@ -555,7 +602,111 @@ void SPIRVEmitIntrinsics::maybeAssignPtrType(Type *&Ty, Value *Op, Type *RefTy, Ty = RefTy; } -Type *getGEPType(GetElementPtrInst *Ref) { +bool SPIRVEmitIntrinsics::walkLogicalAccessChain( + GetElementPtrInst &GEP, + const std::function<void(Type *, uint64_t)> &OnLiteralIndexing, + const std::function<void(Type *, Value *)> &OnDynamicIndexing) { + // We only rewrite i8* GEP. Other should be left as-is. + // Valid i8* GEP must always have a single index. + assert(GEP.getSourceElementType() == + IntegerType::getInt8Ty(CurrF->getContext())); + assert(GEP.getNumIndices() == 1); + + auto &DL = CurrF->getDataLayout(); + Value *Src = getPointerRoot(GEP.getPointerOperand()); + Type *CurType = deduceElementType(Src, true); + + Value *Operand = *GEP.idx_begin(); + ConstantInt *CI = dyn_cast<ConstantInt>(Operand); + if (!CI) { + ArrayType *AT = dyn_cast<ArrayType>(CurType); + // Operand is not constant. Either we have an array and accept it, or we + // give up. + if (AT) + OnDynamicIndexing(AT->getElementType(), Operand); + return AT == nullptr; + } + + assert(CI); + uint64_t Offset = CI->getZExtValue(); + + do { + if (ArrayType *AT = dyn_cast<ArrayType>(CurType)) { + uint32_t EltTypeSize = DL.getTypeSizeInBits(AT->getElementType()) / 8; + assert(Offset < AT->getNumElements() * EltTypeSize); + uint64_t Index = Offset / EltTypeSize; + Offset = Offset - (Index * EltTypeSize); + CurType = AT->getElementType(); + OnLiteralIndexing(CurType, Index); + } else if (StructType *ST = dyn_cast<StructType>(CurType)) { + uint32_t StructSize = DL.getTypeSizeInBits(ST) / 8; + assert(Offset < StructSize); + const auto &STL = DL.getStructLayout(ST); + unsigned Element = STL->getElementContainingOffset(Offset); + Offset -= STL->getElementOffset(Element); + CurType = ST->getElementType(Element); + OnLiteralIndexing(CurType, Element); + } else { + // Vector type indexing should not use GEP. + // So if we have an index left, something is wrong. Giving up. + return true; + } + } while (Offset > 0); + + return false; +} + +Instruction * +SPIRVEmitIntrinsics::buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP) { + auto &DL = CurrF->getDataLayout(); + IRBuilder<> B(GEP.getParent()); + B.SetInsertPoint(&GEP); + + std::vector<Value *> Indices; + Indices.push_back(ConstantInt::get( + IntegerType::getInt32Ty(CurrF->getContext()), 0, /* Signed= */ false)); + walkLogicalAccessChain( + GEP, + [&Indices, &B](Type *EltType, uint64_t Index) { + Indices.push_back( + ConstantInt::get(B.getInt64Ty(), Index, /* Signed= */ false)); + }, + [&Indices, &B, &DL](Type *EltType, Value *Offset) { + uint32_t EltTypeSize = DL.getTypeSizeInBits(EltType) / 8; + Value *Index = B.CreateUDiv( + Offset, ConstantInt::get(Offset->getType(), EltTypeSize, + /* Signed= */ false)); + Indices.push_back(Index); + }); + + SmallVector<Type *, 2> Types = {GEP.getType(), GEP.getOperand(0)->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(GEP.isInBounds())); + Args.push_back(GEP.getOperand(0)); + llvm::append_range(Args, Indices); + auto *NewI = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + replaceAllUsesWithAndErase(B, &GEP, NewI); + return NewI; +} + +Type *SPIRVEmitIntrinsics::getGEPTypeLogical(GetElementPtrInst *GEP) { + + Type *CurType = GEP->getResultElementType(); + + bool Interrupted = walkLogicalAccessChain( + *GEP, [&CurType](Type *EltType, uint64_t Index) { CurType = EltType; }, + [&CurType](Type *EltType, Value *Index) { CurType = EltType; }); + + return Interrupted ? GEP->getResultElementType() : CurType; +} + +Type *SPIRVEmitIntrinsics::getGEPType(GetElementPtrInst *Ref) { + if (Ref->getSourceElementType() == + IntegerType::getInt8Ty(CurrF->getContext()) && + TM->getSubtargetImpl()->isLogicalSPIRV()) { + return getGEPTypeLogical(Ref); + } + Type *Ty = nullptr; // TODO: not sure if GetElementPtrInst::getTypeAtIndex() does anything // useful here @@ -1395,6 +1546,13 @@ Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) { } Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { + if (I.getSourceElementType() == IntegerType::getInt8Ty(CurrF->getContext()) && + TM->getSubtargetImpl()->isLogicalSPIRV()) { + Instruction *Result = buildLogicalAccessChainFromGEP(I); + if (Result) + return Result; + } + IRBuilder<> B(I.getParent()); B.SetInsertPoint(&I); SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()}; @@ -1588,7 +1746,24 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, } if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { Value *Pointer = GEPI->getPointerOperand(); - Type *OpTy = GEPI->getSourceElementType(); + Type *OpTy = nullptr; + + // Knowing the accessed type is mandatory for logical SPIR-V. Sadly, + // the GEP source element type should not be used for this purpose, and + // the alternative type-scavenging method is not working. + // Physical SPIR-V can work around this, but not logical, hence still + // try to rely on the broken type scavenging for logical. + bool IsRewrittenGEP = + GEPI->getSourceElementType() == IntegerType::getInt8Ty(I->getContext()); + if (IsRewrittenGEP && TM->getSubtargetImpl()->isLogicalSPIRV()) { + Value *Src = getPointerRoot(Pointer); + OpTy = GR->findDeducedElementType(Src); + } + + // In all cases, fall back to the GEP type if type scavenging failed. + if (!OpTy) + OpTy = GEPI->getSourceElementType(); + replacePointerOperandWithPtrCast(I, Pointer, OpTy, 0, B); if (isNestedPointer(OpTy)) insertTodoType(Pointer); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 5cda6a0..7505507 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - // We expect the codegen to avoid doing implicit bitcast from a load. - assert(TargetType->getElementType() == SourceType->getElementType()); - assert(TargetType->getNumElements() < SourceType->getNumElements()); - + assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); + Value *AssignValue = NewLoad; + if (TargetType->getElementType() != SourceType->getElementType()) { + AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, + {TargetType, SourceType}, {NewLoad}); + buildAssignType(B, TargetType, AssignValue); + } SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; - Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask); + Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask); buildAssignType(B, TargetType, Output); return Output; } @@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass { Output = loadFirstValueFromAggregate(B, SVT->getElementType(), OriginalOperand, LI); } - // Destination is a smaller vector than source. + // Destination is a smaller vector than source or different vector type. // - float3 v3 = vector4; + // - float4 v2 = int4; else if (SVT && DVT) Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand); // Destination is the scalar type stored at the start of an aggregate. diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 721f64a..1995e0f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal(); } + getActionDefinitionsBuilder(G_IS_FPCLASS).custom(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType, bool SPIRVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { - auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - if (Opc == TargetOpcode::G_ICMP) { + switch (MI.getOpcode()) { + default: + // TODO: implement legalization for other opcodes. + return true; + case TargetOpcode::G_IS_FPCLASS: + return legalizeIsFPClass(Helper, MI, LocObserver); + case TargetOpcode::G_ICMP: { assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg())); auto &Op0 = MI.getOperand(2); auto &Op1 = MI.getOperand(3); @@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom( } return true; } - // TODO: implement legalization for other opcodes. + } +} + +// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted +// to ensure that all instructions created during the lowering have SPIR-V types +// assigned to them. +bool SPIRVLegalizerInfo::legalizeIsFPClass( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); + + auto &MIRBuilder = Helper.MIRBuilder; + auto &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Type *LLVMDstTy = + IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits()); + if (DstTy.isVector()) + LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount()); + SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType( + LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + unsigned BitSize = SrcTy.getScalarSizeInBits(); + const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); + + LLT IntTy = LLT::scalar(BitSize); + Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize); + if (SrcTy.isVector()) { + IntTy = LLT::vector(SrcTy.getElementCount(), IntTy); + LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount()); + } + SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType( + LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + // Clang doesn't support capture of structured bindings: + LLT DstTyCopy = DstTy; + const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) { + // Assign this MI's (assumed only) destination to one of the two types we + // expect: either the G_IS_FPCLASS's destination type, or the integer type + // bitcast from the source type. + LLT MITy = MRI.getType(MI.getReg(0)); + assert((MITy == IntTy || MITy == DstTyCopy) && + "Unexpected LLT type while lowering G_IS_FPCLASS"); + auto *SPVTy = MITy == IntTy ? SPIRVIntTy : SPIRVDstTy; + GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF); + return MI; + }; + + // Helper to build and assign a constant in one go + const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder { + if (!Ty.isFixedVector()) + return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C)); + auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C); + assert((Ty == IntTy || Ty == DstTyCopy) && + "Unexpected LLT type while lowering constant for G_IS_FPCLASS"); + SPIRVType *VecEltTy = GR->getOrCreateSPIRVType( + (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF); + return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC)); + }; + + if (Mask == fcNone) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0)); + MI.eraseFromParent(); + return true; + } + if (Mask == fcAllFlags) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1)); + MI.eraseFromParent(); + return true; + } + + // Note that rather than creating a COPY here (between a floating-point and + // integer type of the same size) we create a SPIR-V bitcast immediately. We + // can't create a G_BITCAST because the LLTs are the same, and we can't seem + // to correctly lower COPYs to SPIR-V bitcasts at this moment. + Register ResVReg = MRI.createGenericVirtualRegister(IntTy); + MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy)); + GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF()); + auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast) + .addDef(ResVReg) + .addUse(GR->getSPIRVTypeID(SPIRVIntTy)) + .addUse(SrcReg); + AsInt = assignSPIRVTy(std::move(AsInt)); + + // Various masks. + APInt SignBit = APInt::getSignMask(BitSize); + APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. + APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. + APInt ExpMask = Inf; + APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; + APInt QNaNBitMask = + APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); + APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); + + auto SignBitC = buildSPIRVConstant(IntTy, SignBit); + auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask); + auto InfC = buildSPIRVConstant(IntTy, Inf); + auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask); + auto ZeroC = buildSPIRVConstant(IntTy, 0); + + auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC)); + auto Sign = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs)); + + auto Res = buildSPIRVConstant(DstTy, 0); + + const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) { + Res = assignSPIRVTy( + MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend)))); + }; + + // Tests that involve more than one class should be processed first. + if ((Mask & fcFinite) == fcFinite) { + // finite(V) ==> abs(V) u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, + ExpMaskC)); + Mask &= ~fcFinite; + } else if ((Mask & fcFinite) == fcPosFinite) { + // finite(V) && V > 0 ==> V u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt, + ExpMaskC)); + Mask &= ~fcPosFinite; + } else if ((Mask & fcFinite) == fcNegFinite) { + // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1 + auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, + DstTy, Abs, ExpMaskC)); + appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign)); + Mask &= ~fcNegFinite; + } + + if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + // TODO: Handle inverted case + if (PartialCheck == (fcZero | fcSubnormal)) { + auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC)); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + ExpBits, ZeroC)); + Mask &= ~PartialCheck; + } + } + + // Check for individual classes. + if (FPClassTest PartialCheck = Mask & fcZero) { + if (PartialCheck == fcPosZero) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, ZeroC)); + else if (PartialCheck == fcZero) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC)); + else // fcNegZero + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, SignBitC)); + } + + if (FPClassTest PartialCheck = Mask & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) + auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs; + auto OneC = buildSPIRVConstant(IntTy, 1); + auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); + auto SubnormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, + buildSPIRVConstant(IntTy, AllOneMantissa))); + if (PartialCheck == fcNegSubnormal) + SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); + appendToRes(std::move(SubnormalRes)); + } + + if (FPClassTest PartialCheck = Mask & fcInf) { + if (PartialCheck == fcPosInf) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, InfC)); + else if (PartialCheck == fcInf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC)); + else { // fcNegInf + APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); + auto NegInfC = buildSPIRVConstant(IntTy, NegInf); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, NegInfC)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNan) { + auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask); + if (PartialCheck == fcNan) { + // isnan(V) ==> abs(V) u> int(inf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + } else if (PartialCheck == fcQNan) { + // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs, + InfWithQnanBitC)); + } else { // fcSNan + // issignaling(V) ==> abs(V) u> unsigned(Inf) && + // abs(V) u< (unsigned(Inf) | quiet_bit) + auto IsNan = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp( + CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC)); + appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNormal) { + // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< + // (max_exp-1)) + APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); + auto ExpMinusOne = assignSPIRVTy( + MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB))); + APInt MaxExpMinusOne = ExpMask - ExpLSB; + auto NormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, + buildSPIRVConstant(IntTy, MaxExpMinusOne))); + if (PartialCheck == fcNegNormal) + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); + else if (PartialCheck == fcPosNormal) { + auto PosSign = assignSPIRVTy(MIRBuilder.buildXor( + DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask))); + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); + } + appendToRes(std::move(NormalRes)); + } + + MIRBuilder.buildCopy(DstReg, Res); + MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 6335f21..eeefa42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -30,6 +30,10 @@ public: bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const SPIRVSubtarget &ST); + +private: + bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h index 43bf6e9..60c4e2d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h @@ -59,6 +59,8 @@ public: Intrinsic::ID IID) const override; Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override; + + bool allowVectorElementIndexingUsingGEP() const override { return false; } }; } // namespace llvm diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index d5f8492..b2cfd04 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -165,7 +165,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= Data.size() && "Invalid fixup offset!"); + assert(Offset + Size <= F.getSize() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. Value = extractBitsForFixup(Kind, Value, Fixup, getContext()); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index e30d723..fb0a47d 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -9044,7 +9044,7 @@ static unsigned detectEvenOddMultiplyOperand(const SelectionDAG &DAG, if (unsigned(ShuffleMask[Elt]) != 2 * Elt) CanUseEven = false; if (unsigned(ShuffleMask[Elt]) != 2 * Elt + 1) - CanUseEven = true; + CanUseOdd = false; } Op = Op.getOperand(0); if (CanUseEven) diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index f987621..b02b6af 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -174,7 +174,7 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits // from the fixup value. The Value has been "split up" into the // appropriate bitfields above. diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 837fd8e..84eb15f 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -97,7 +97,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index a606209..089be5f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -49,6 +49,8 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable FP16 instructions">; +def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; + def FeatureMultiMemory : SubtargetFeature<"multimemory", "HasMultiMemory", "true", "Enable multiple memories">; @@ -71,7 +73,6 @@ def FeatureReferenceTypes : SubtargetFeature<"reference-types", "HasReferenceTypes", "true", "Enable reference types">; -def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD", "Enable relaxed-simd instructions">; @@ -139,10 +140,10 @@ def : ProcessorModel<"lime1", NoSchedModel, def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, FeatureCallIndirectOverlong, FeatureExceptionHandling, - FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, - FeatureMultivalue, FeatureMutableGlobals, + FeatureExtendedConst, FeatureFP16, FeatureGC, + FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals, FeatureNontrappingFPToInt, FeatureRelaxedSIMD, - FeatureReferenceTypes, FeatureGC, FeatureSIMD128, + FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, FeatureTailCall]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index cd434f7..3f80b2a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::MUL); +static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v8i32 && VT != MVT::v16i32) return SDValue(); @@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performMulCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::MUL); + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) + return Res; + + // We don't natively support v16i8 mul, but we do support v8i16 so split the + // inputs and extend them to v8i16. Only do this before legalization in case + // a narrow vector is widened and may be simplified later. + if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); + SDValue MulHigh = DAG.getBitcast( + VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); + + // Take the low byte of each lane. + return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::MUL: - return performMulCombine(N, DCI.DAG); + return performMulCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 2b632fd..13d048a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -50,6 +50,9 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<(all_of FeatureFP16), "fp16">; +def HasGC : Predicate<"Subtarget->hasGC()">, + AssemblerPredicate<(all_of FeatureGC), "gc">; + def HasMultiMemory : Predicate<"Subtarget->hasMultiMemory()">, AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">; @@ -76,9 +79,6 @@ def HasReferenceTypes : Predicate<"Subtarget->hasReferenceTypes()">, AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">; -def HasGC : Predicate<"Subtarget->hasGC()">, - AssemblerPredicate<(all_of FeatureGC), "gc">; - def HasRelaxedSIMD : Predicate<"Subtarget->hasRelaxedSIMD()">, AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d13862f..143298b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 28f6599..c3990d1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { for (Instruction &I : BB) { if (I.getType()->isVoidTy()) continue; + + if (isa<AllocaInst>(&I)) { + // If the alloca has any lifetime marker that is no longer dominated + // by the alloca, remove all lifetime markers. Lifetime markers must + // always work directly on the alloca, and this is no longer possible. + bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) { + auto *UserI = cast<Instruction>(U); + return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI); + }); + if (HasNonDominatedLifetimeMarker) { + for (User *U : make_early_inc_range(I.users())) { + auto *UserI = cast<Instruction>(U); + if (UserI->isLifetimeStartOrEnd()) + UserI->eraseFromParent(); + } + } + } + unsigned VarID = SSA.AddVariable(I.getName(), I.getType()); // If a value is defined by an invoke instruction, it is only available in // its normal destination and not in its unwind destination. @@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Setjmp preparation + SmallVector<AllocaInst *> StaticAllocas; + for (Instruction &I : F.getEntryBlock()) + if (auto *AI = dyn_cast<AllocaInst>(&I)) + if (AI->isStaticAlloca()) + StaticAllocas.push_back(AI); + BasicBlock *Entry = &F.getEntryBlock(); DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram()); SplitBlock(Entry, &*Entry->getFirstInsertionPt()); + // Move static allocas back into the entry block, so they stay static. + for (AllocaInst *AI : StaticAllocas) + AI->moveBefore(Entry->getTerminator()->getIterator()); + IRB.SetInsertPoint(Entry->getTerminator()->getIterator()); // This alloca'ed pointer is used by the runtime to identify function // invocations. It's just for pointer comparisons. It will never be diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index f814274..2f88bbb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -46,12 +46,12 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasExceptionHandling = false; bool HasExtendedConst = false; bool HasFP16 = false; + bool HasGC = false; bool HasMultiMemory = false; bool HasMultivalue = false; bool HasMutableGlobals = false; bool HasNontrappingFPToInt = false; bool HasReferenceTypes = false; - bool HasGC = false; bool HasSignExt = false; bool HasTailCall = false; bool HasWideArithmetic = false; diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 1bf9f8b..f9bd233 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources} IRPrinter Instrumentation MC + ObjCARC ProfileData Scalar SelectionDAG diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 7f9d474..1efef83 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -690,7 +690,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; unsigned Size = getFixupKindSize(Kind); - assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); int64_t SignedValue = static_cast<int64_t>(Value); if (IsResolved && Fixup.isPCRel()) { diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 0ff7f23..067bd43 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3673,6 +3673,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CLI.NumResultRegs = RVLocs.size(); CLI.Call = MIB; + // Add call site info for call graph section. + if (TM.Options.EmitCallGraphSection && CB && CB->isIndirectCall()) { + MachineFunction::CallSiteInfo CSInfo(*CB); + MF->addCallSiteInfo(CLI.Call, std::move(CSInfo)); + } + return true; } @@ -4042,6 +4048,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, MO.setReg(IndexReg); } + if (MI->isCall()) + FuncInfo.MF->moveAdditionalCallInfo(MI, Result); Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); Result->cloneInstrSymbols(*FuncInfo.MF, *MI); MachineBasicBlock::iterator I(MI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 11ab8dc..bbbb1d9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58071,14 +58071,24 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { Ops[3] = Op1.getOperand(0); Ops[4] = Op1.getOperand(1); } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) { + SDValue Src = Op1; + SDValue Op10 = Op1.getOperand(0); + if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) { + // res, flags2 = sub 0, (and (xor X, -1), Y) + // cload/cstore ..., cond_ne, flag2 + // -> + // res, flags2 = sub 0, (and X, Y) + // cload/cstore ..., cond_e, flag2 + Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0), + Op1.getOperand(1)); + Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8); + } // res, flags2 = sub 0, (and X, Y) - // cload/cstore ..., cond_ne, flag2 + // cload/cstore ..., cc, flag2 // -> - // res, flags2 = and X, Y - // cload/cstore ..., cond_ne, flag2 - Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0), - Op1.getOperand(1)) - .getValue(1); + // res, flags2 = cmp (and X, Y), 0 + // cload/cstore ..., cc, flag2 + Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0)); } else { return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b4639ac..5862c7e 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2060,6 +2060,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + if (IsIndirectCall && !IsWin64 && M->getModuleFlag("import-call-optimization")) errorUnsupported(DAG, dl, |