diff options
Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64Combine.td | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 32 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 237 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 29 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 57 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 20 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 47 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 90 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 2 |
13 files changed, 361 insertions, 188 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ca09598..99f0af5 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -39,8 +39,8 @@ let Predicates = [HasDotProd] in { def ext_addv_to_udot_addv : GICombineRule< (defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo), (match (wip_match_opcode G_VECREDUCE_ADD):$root, - [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]), - (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) + [{ return matchExtAddvToDotAddv(*${root}, MRI, STI, ${matchinfo}); }]), + (apply [{ applyExtAddvToDotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) >; } @@ -62,8 +62,10 @@ class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICom def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>; def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>; +def push_mul_through_zext : push_opcode_through_ext<G_MUL, G_ZEXT>; def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>; def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>; +def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, @@ -75,8 +77,10 @@ def AArch64PreLegalizerCombiner: GICombiner< ext_uaddv_to_uaddlv, push_sub_through_zext, push_add_through_zext, + push_mul_through_zext, push_sub_through_sext, - push_add_through_sext]> { + push_add_through_sext, + push_mul_through_sext]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eca7ca5..ad42f4b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM_PSEUDO, AArch64::LDNT1B_2Z_PSEUDO); @@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM_PSEUDO, AArch64::LDNT1H_2Z_PSEUDO); @@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM_PSEUDO, AArch64::LDNT1W_2Z_PSEUDO); @@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM_PSEUDO, AArch64::LDNT1D_2Z_PSEUDO); @@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM_PSEUDO, AArch64::LDNT1B_4Z_PSEUDO); @@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM_PSEUDO, AArch64::LDNT1H_4Z_PSEUDO); @@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM_PSEUDO, AArch64::LDNT1W_4Z_PSEUDO); @@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM_PSEUDO, AArch64::LDNT1D_4Z_PSEUDO); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b49754..8312b04 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -162,10 +162,10 @@ static cl::opt<bool> UseFEATCPACodegen( cl::init(false)); /// Value type used for condition codes. -static const MVT MVT_CC = MVT::i32; +constexpr MVT CondCodeVT = MVT::i32; /// Value type used for NZCV flags. -static constexpr MVT FlagsVT = MVT::i32; +constexpr MVT FlagsVT = MVT::i32; static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, @@ -3472,6 +3472,12 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, } } +/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes. +static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC) { + // TODO: Should be TargetConstant (need to s/imm/timm in patterns). + return DAG.getConstant(CC, SDLoc(), CondCodeVT); +} + static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); @@ -3678,7 +3684,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, if (Opcode == 0) Opcode = AArch64ISD::CCMP; - SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + SDValue Condition = getCondCode(DAG, Predicate); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); @@ -4075,7 +4081,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, Cmp = emitComparison(LHS, RHS, CC, DL, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } - AArch64cc = DAG.getConstant(AArch64CC, DL, MVT_CC); + AArch64cc = getCondCode(DAG, AArch64CC); return Cmp; } @@ -4195,7 +4201,7 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { AArch64CC::CondCode CC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC)); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } @@ -4274,8 +4280,8 @@ static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); - unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS; - SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); + AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS; + SDValue CC = getCondCode(DAG, Cond); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } @@ -4285,7 +4291,7 @@ static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); - SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); + SDValue CC = getCondCode(DAG, AArch64CC::VS); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } @@ -4334,7 +4340,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC)); Overflow = DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow); @@ -7124,8 +7130,7 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, - DAG.getConstant(AArch64CC::PL, DL, MVT::i32), - Cmp.getValue(1)); + getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1)); } static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { @@ -7136,7 +7141,7 @@ static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { AArch64CC::CondCode CC; if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { SDLoc DL(Op); - SDValue CCVal = DAG.getConstant(CC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, CC); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal, Cmp); } @@ -8952,6 +8957,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFunction::CallSiteInfo CSInfo; @@ -8991,6 +8997,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { @@ -10570,7 +10580,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, OFCC); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal, Overflow); @@ -10643,7 +10653,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { AArch64CC::isValidCBCond(changeIntCCToAArch64CC(CC)) && ProduceNonFlagSettingCondBr) { SDValue Cond = - DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, MVT::i32); + DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, CondCodeVT); return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS, Dest); } @@ -10662,11 +10672,11 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val, Cmp); } @@ -11155,7 +11165,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be @@ -11168,11 +11178,11 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp); - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp); } return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res; @@ -11200,8 +11210,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get(); ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT); - SDValue CCVal = - DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv)); // Inputs are swapped because the condition is inverted. This will allow // matching with a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal, @@ -11325,7 +11334,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue AArch64TargetLowering::LowerSELECT_CC( ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, - iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs, + iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. @@ -11386,6 +11395,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Canonicalise absolute difference patterns: + // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> + // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc + // + // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> + // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc + // The second forms can be matched into subs+cneg. + if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { + if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); + else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } + unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in @@ -11523,7 +11548,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return true; } })) { - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs(); SDValue VectorCmp = emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG); if (VectorCmp) @@ -11537,7 +11562,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - if (DAG.getTarget().Options.UnsafeFPMath) { + if (Flags.hasNoSignedZeros()) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); @@ -11556,13 +11581,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( } // Emit first, and possibly only, CSEL. - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp); } @@ -11616,10 +11641,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); - bool HasNoNans = Op->getFlags().hasNoNaNs(); + SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, - DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11627,7 +11651,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); - bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11666,7 +11689,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, OFCC); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); @@ -11694,8 +11717,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = - LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); + SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), + Op->getFlags(), DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -12292,7 +12315,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags = SDNodeFlags::AllowReassociation; + // Ensure nodes can be recognized by isAssociativeAndCommutative. + SDNodeFlags Flags = + SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros; // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -12504,10 +12529,10 @@ static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { /// WZR, invert(<cond>)'. static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG) { - return DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV); + return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + getCondCode(DAG, getInvertedCondCode(CC)), NZCV); } // Lower @cc flag output via getSETCC. @@ -16674,7 +16699,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath)); + I->getFastMathFlags().allowContract())); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -18678,7 +18703,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, Created.push_back(Cmp.getNode()); Created.push_back(And.getNode()); } else { - SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); + SDValue CCVal = getCondCode(DAG, AArch64CC::MI); SDVTList VTs = DAG.getVTList(VT, FlagsVT); SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); @@ -19550,11 +19575,11 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::AND) { AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); - Condition = DAG.getConstant(InvCC0, DL, MVT_CC); + Condition = getCondCode(DAG, InvCC0); NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); } else { AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); - Condition = DAG.getConstant(CC0, DL, MVT_CC); + Condition = getCondCode(DAG, CC0); NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); } @@ -19575,8 +19600,7 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); } return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), - CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32), - CCmp); + CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -19781,7 +19805,7 @@ static SDValue performANDSETCCCombine(SDNode *N, SDLoc DL(N); return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT), - DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp); + getCondCode(DAG, InvertedCC), Cmp); } } return SDValue(); @@ -20772,7 +20796,7 @@ static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) { "Unexpected constant value"); SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0)); - SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, AArch64CC); SDValue Cmp = LHS.getOperand(3); return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp); @@ -20958,7 +20982,7 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // (CINC x cc cond) <=> (CSINC x x !cc cond) - SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32); + SDValue CC = getCondCode(DAG, AArch64CC::LO); return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); } @@ -22031,7 +22055,7 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, // Convert CC to integer based on requested condition. // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. - SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); + SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond)); SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); return DAG.getZExtOrTrunc(Res, DL, VT); } @@ -24112,6 +24136,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, Store->getMemOperand()); } +// Combine store (fp_to_int X) to use vector semantics around the conversion +// when NEON is available. This allows us to store the in-vector result directly +// without transferring the result into a GPR in the process. +static SDValue combineStoreValueFPToInt(StoreSDNode *ST, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + // Limit to post-legalization in order to avoid peeling truncating stores. + if (DCI.isBeforeLegalize()) + return SDValue(); + if (!Subtarget->isNeonAvailable()) + return SDValue(); + // Source operand is already a vector. + SDValue Value = ST->getValue(); + if (Value.getValueType().isVector()) + return SDValue(); + + // Look through potential assertions. + while (Value->isAssert()) + Value = Value.getOperand(0); + + if (Value.getOpcode() != ISD::FP_TO_SINT && + Value.getOpcode() != ISD::FP_TO_UINT) + return SDValue(); + if (!Value->hasOneUse()) + return SDValue(); + + SDValue FPSrc = Value.getOperand(0); + EVT SrcVT = FPSrc.getValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // No support for assignments such as i64 = fp_to_sint i32 + EVT VT = Value.getSimpleValueType(); + if (VT != SrcVT.changeTypeToInteger()) + return SDValue(); + + // Create a 128-bit element vector to avoid widening. The floating point + // conversion is transformed into a single element conversion via a pattern. + unsigned NumElements = 128 / SrcVT.getFixedSizeInBits(); + EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements); + EVT VecDstVT = VecSrcVT.changeTypeToInteger(); + SDLoc DL(ST); + SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc); + SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP); + + SDValue Zero = DAG.getVectorIdxConstant(0, DL); + SDValue Extracted = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero); + + DCI.CombineTo(ST->getValue().getNode(), Extracted); + return SDValue(ST, 0); +} + bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || @@ -24194,6 +24272,9 @@ static SDValue performSTORECombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(ST); + if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget)) + return Res; + auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; @@ -25015,10 +25096,9 @@ static SDValue performBRCONDCombine(SDNode *N, auto CSelCC = getCSETCondCode(CSel); if (CSelCC) { SDLoc DL(N); - return DAG.getNode( - N->getOpcode(), DL, N->getVTList(), Chain, Dest, - DAG.getConstant(getInvertedCondCode(*CSelCC), DL, MVT::i32), - CSel.getOperand(3)); + return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest, + getCondCode(DAG, getInvertedCondCode(*CSelCC)), + CSel.getOperand(3)); } } @@ -25159,7 +25239,7 @@ static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op->getValueType(0); - SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32); + SDValue CCValue = getCondCode(DAG, CC); return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond); } @@ -25236,8 +25316,7 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { SDValue TValReassoc = Reassociate(TReassocOp, 0); SDValue FValReassoc = Reassociate(FReassocOp, 1); return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc, - DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC), - NewCmp.getValue(1)); + getCondCode(DAG, NewCC), NewCmp.getValue(1)); }; auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2)); @@ -25378,8 +25457,7 @@ static SDValue performCSELCombine(SDNode *N, SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(), Cond.getOperand(1), Cond.getOperand(0)); return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0), - N->getOperand(1), - DAG.getConstant(NewCond, DL, MVT::i32), + N->getOperand(1), getCondCode(DAG, NewCond), Sub.getValue(1)); } } @@ -25479,10 +25557,9 @@ static SDValue performSETCCCombine(SDNode *N, auto NewCond = getInvertedCondCode(OldCond); // csel 0, 1, !cond, X - SDValue CSEL = - DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), - LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), - LHS.getOperand(3)); + SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), + LHS.getOperand(0), LHS.getOperand(1), + getCondCode(DAG, NewCond), LHS.getOperand(3)); return DAG.getZExtOrTrunc(CSEL, DL, VT); } @@ -25552,8 +25629,7 @@ static SDValue performFlagSettingCombine(SDNode *N, // If the flag result isn't used, convert back to a generic opcode. if (!N->hasAnyUseOfValue(1)) { SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); - return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, - DL); + return DCI.CombineTo(N, Res, SDValue(N, 1)); } // Combine identical generic nodes into this node, re-using the result. @@ -26926,6 +27002,23 @@ static SDValue performSHLCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); } +static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntrinsicID = N->getConstantOperandVal(1); + auto Register = + (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR + : AArch64SysReg::RNDRRS); + SDLoc DL(N); + SDValue A = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), + N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); + SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + getCondCode(DAG, AArch64CC::NE), A.getValue(1)); + return DAG.getMergeValues( + {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -27241,22 +27334,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: - case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = N->getConstantOperandVal(1); - auto Register = - (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR - : AArch64SysReg::RNDRRS); - SDLoc DL(N); - SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), - N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); - return DAG.getMergeValues( - {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); - } + case Intrinsic::aarch64_rndrrs: + return performRNDRCombine(N, DAG); case Intrinsic::aarch64_sme_ldr_zt: return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), DAG.getVTList(MVT::Other), N->getOperand(0), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 95d0e3b..ea63edd8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -662,7 +662,7 @@ private: SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, iterator_range<SDNode::user_iterator> Users, - bool HasNoNans, const SDLoc &dl, + SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8685d7a0..59d4fd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by // the target options or if FADD/FSUB has the contract fast-math flag. - return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || + return Options.AllowFPOpFusion == FPOpFusion::Fast || Inst.getFlag(MachineInstr::FmContract); - return true; } return false; } @@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, case AArch64::FMUL_ZZZ_H: case AArch64::FMUL_ZZZ_S: case AArch64::FMUL_ZZZ_D: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || - (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz)); + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); // == Integer types == // -- Base instructions -- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 07cacfa..ac31236 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -448,8 +448,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisVT<1, FlagsVT>, SDTCisVT<4, FlagsVT>]>; +// Value type used for condition codes. +// Should be kept in sync with its C++ counterpart. +defvar CondCodeVT = i32; + def SDT_AArch64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, CondCodeVT>, SDTCisVT<2, FlagsVT>]>; def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, @@ -458,22 +463,22 @@ def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<3>, + SDTCisVT<3, CondCodeVT>, SDTCisVT<4, FlagsVT>]>; def SDT_AArch64CCMP : SDTypeProfile<1, 5, [SDTCisVT<0, FlagsVT>, SDTCisInt<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; + SDTCisVT<4, CondCodeVT>, + SDTCisVT<5, FlagsVT>]>; def SDT_AArch64FCCMP : SDTypeProfile<1, 5, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; + SDTCisVT<4, CondCodeVT>, + SDTCisVT<5, FlagsVT>]>; def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; @@ -546,7 +551,8 @@ def SDT_AArch64TBL : SDTypeProfile<1, 2, [ ]>; def SDT_AArch64cb : SDTypeProfile<0, 4, - [SDTCisVT<0, i32>, SDTCisInt<1>, SDTCisInt<2>, + [SDTCisVT<0, CondCodeVT>, + SDTCisInt<1>, SDTCisInt<2>, SDTCisVT<3, OtherVT>]>; // non-extending masked load fragment. @@ -6668,6 +6674,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } +def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>; +def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>; + // int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index abcd550..b97d622 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -12,7 +12,7 @@ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri // // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi // MOVi64imm + SUBXrr ==> SUBXri + SUBXri @@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { template <typename T> bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + // Strategy used to split logical immediate bitmasks. + enum class SplitStrategy { + Intersect, + }; template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); + bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); - if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) - return false; - - // If this immediate can be handled by one instruction, do not split it. - SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; - AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); - if (Insn.size() == 1) - return false; // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, - unsigned OtherOpc) { +bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, + unsigned OtherOpc) { // Try below transformation. // // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri @@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, return splitTwoPartImm<T>( MI, - [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { - if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { + // If this immediate is already a suitable bitmask, don't split it. + // TODO: Should we just combine the two instructions in this case? + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return std::nullopt; + + // If this immediate can be handled by one instruction, don't split it. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return std::nullopt; + + bool SplitSucc = false; + switch (Strategy) { + case SplitStrategy::Intersect: + SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; + } + if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); return std::nullopt; }, @@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= visitINSERT(MI); break; case AArch64::ANDWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDSWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + Changed |= trySplitLogicalImm<uint32_t>( + AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri); break; case AArch64::ANDSXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + Changed |= trySplitLogicalImm<uint64_t>( + AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; case AArch64::ORRWrs: Changed |= visitORR(MI); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 40f49da..e1adc0b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -270,6 +270,13 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMECallAttrs CallAttrs(*Caller, *Callee); + // Never inline a function explicitly marked as being streaming, + // into a non-streaming function. Assume it was marked as streaming + // for a reason. + if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() && + CallAttrs.callee().hasStreamingInterfaceOrBody()) + return false; + // When inlining, we should consider the body of the function, not the // interface. if (CallAttrs.callee().hasStreamingBody()) { @@ -4905,14 +4912,17 @@ void AArch64TTIImpl::getUnrollingPreferences( // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; - // No need to unroll auto-vectorized loops - if (findStringMetadataForLoop(L, "llvm.loop.isvectorized")) - return; - // Scan the loop: don't unroll loops with calls as this could prevent - // inlining. + // inlining. Don't unroll auto-vectorized loops either, though do allow + // unrolling of the scalar remainder. + bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized"); for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Both auto-vectorized loops and the scalar remainder have the + // isvectorized attribute, so differentiate between them by the presence + // of vector instructions. + if (IsVectorized && I.getType()->isVectorTy()) + return; if (isa<CallBase>(I)) { if (isa<CallInst>(I) || isa<InvokeInst>(I)) if (const Function *F = cast<CallBase>(I).getCalledFunction()) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index 0b79850..1a15075 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -50,8 +50,10 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub, // // %sub = G_SUB 0, %y // %cmp = G_ICMP eq/ne, %z, %sub + // or with signed comparisons with the no-signed-wrap flag set if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB || - !CmpInst::isEquality(Pred)) + (!CmpInst::isEquality(Pred) && + !(CmpInst::isSigned(Pred) && MaybeSub->getFlag(MachineInstr::NoSWrap)))) return false; auto MaybeZero = getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 1381a9b..d905692 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1810,7 +1810,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp( // Couldn't optimize. Emit a compare + a Bcc. MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - auto PredOp = ICmp.getOperand(1); + auto &PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( static_cast<CmpInst::Predicate>(PredOp.getPredicate())); @@ -2506,12 +2506,12 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { return false; } auto &PredOp = Cmp->getOperand(1); - auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); MIB.setInstrAndDebugLoc(I); emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); + const AArch64CC::CondCode InvCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; @@ -3574,10 +3574,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; } - auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); + auto &PredOp = I.getOperand(1); + emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); - emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); @@ -5096,11 +5097,11 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { - auto Pred = - static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); + auto &PredOp = CondDef->getOperand(1); + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp, + MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); CondCode = changeICMPPredToAArch64CC(Pred); - emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB); } else { // Get the condition code for the select. auto Pred = @@ -5148,29 +5149,37 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); + // Given this: // // x = G_SUB 0, y - // G_ICMP x, z + // G_ICMP z, x // // Produce this: // - // cmn y, z - if (isCMN(LHSDef, P, MRI)) - return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + // cmn z, y + if (isCMN(RHSDef, P, MRI)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); - // Same idea here, but with the RHS of the compare instead: + // Same idea here, but with the LHS of the compare instead: // // Given this: // // x = G_SUB 0, y - // G_ICMP z, x + // G_ICMP x, z // // Produce this: // - // cmn z, y - if (isCMN(RHSDef, P, MRI)) - return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + // cmn y, z + // + // But be careful! We need to swap the predicate! + if (isCMN(LHSDef, P, MRI)) { + if (!CmpInst::isEquality(P)) { + P = CmpInst::getSwappedPredicate(P); + Predicate = MachineOperand::CreatePredicate(P); + } + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + } // Given this: // diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb0f667b..e0e1af7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; }; + auto LowerTriOp = [&MI, &MIB](unsigned Opcode) { + MIB.buildInstr(Opcode, {MI.getOperand(0)}, + {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)}); + MI.eraseFromParent(); + return true; + }; Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { @@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_USUBSAT); break; } + case Intrinsic::aarch64_neon_udot: + return LowerTriOp(AArch64::G_UDOT); + case Intrinsic::aarch64_neon_sdot: + return LowerTriOp(AArch64::G_SDOT); case Intrinsic::vector_reverse: // TODO: Add support for vector_reverse diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 1cd9453..8c10673 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -228,12 +228,13 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset))); } -// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y)) -// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1)) +// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1)) // Similar to performVecReduceAddCombine in SelectionDAG -bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -246,31 +247,57 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32) return false; - LLT SrcTy; - auto I1Opc = I1->getOpcode(); - if (I1Opc == TargetOpcode::G_MUL) { + // Detect mul(ext, ext) with symmetric ext's. If I1Opc is G_ZEXT or G_SEXT + // then the ext's must match the same opcode. It is set to the ext opcode on + // output. + auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1, + Register &Out2, unsigned &I1Opc) { // If result of this has more than 1 use, then there is no point in creating - // udot instruction - if (!MRI.hasOneNonDBGUse(MidReg)) + // a dot instruction + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; MachineInstr *ExtMI1 = - getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); MachineInstr *ExtMI2 = - getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI); LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg()); LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg()); if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy) return false; + if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) && + I1Opc != ExtMI1->getOpcode()) + return false; + Out1 = ExtMI1->getOperand(1).getReg(); + Out2 = ExtMI2->getOperand(1).getReg(); I1Opc = ExtMI1->getOpcode(); - SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg(); + return true; + }; + + LLT SrcTy; + unsigned I1Opc = I1->getOpcode(); + if (I1Opc == TargetOpcode::G_MUL) { + Register Out1, Out2; + if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc)) + return false; + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; } else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) { - SrcTy = MRI.getType(I1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = I1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = 0; + Register I1Op = I1->getOperand(1).getReg(); + MachineInstr *M = getDefIgnoringCopies(I1Op, MRI); + Register Out1, Out2; + if (M->getOpcode() == TargetOpcode::G_MUL && + tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) { + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; + } else { + SrcTy = MRI.getType(I1Op); + std::get<0>(MatchInfo) = I1Op; + std::get<1>(MatchInfo) = 0; + } } else { return false; } @@ -288,11 +315,11 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } -void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &Builder, - GISelChangeObserver &Observer, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &Builder, + GISelChangeObserver &Observer, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -553,15 +580,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } -// Pushes ADD/SUB through extend instructions to decrease the number of extend -// instruction at the end by allowing selection of {s|u}addl sooner - +// Pushes ADD/SUB/MUL through extend instructions to decrease the number of +// extend instruction at the end by allowing selection of {s|u}addl sooner // i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, Register DstReg, Register SrcReg1, Register SrcReg2) { assert((MI.getOpcode() == TargetOpcode::G_ADD || - MI.getOpcode() == TargetOpcode::G_SUB) && - "Expected a G_ADD or G_SUB instruction\n"); + MI.getOpcode() == TargetOpcode::G_SUB || + MI.getOpcode() == TargetOpcode::G_MUL) && + "Expected a G_ADD, G_SUB or G_MUL instruction\n"); // Deal with vector types only LLT DstTy = MRI.getType(DstReg); @@ -594,9 +621,10 @@ void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); // G_SUB has to sign-extend the result. - // G_ADD needs to sext from sext and can sext or zext from zext, so the - // original opcode is used. - if (MI.getOpcode() == TargetOpcode::G_ADD) + // G_ADD needs to sext from sext and can sext or zext from zext, and G_MUL + // needs to use the original opcode so the original opcode is used for both. + if (MI.getOpcode() == TargetOpcode::G_ADD || + MI.getOpcode() == TargetOpcode::G_MUL) B.buildInstr(Opc, {DstReg}, {AddReg}); else B.buildSExt(DstReg, AddReg); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index b9d3e1b..6912caf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -461,7 +461,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); |