aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AArch64
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp135
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp57
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp13
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp47
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp10
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp90
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp3
13 files changed, 292 insertions, 129 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ca09598..99f0af5 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -39,8 +39,8 @@ let Predicates = [HasDotProd] in {
def ext_addv_to_udot_addv : GICombineRule<
(defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo),
(match (wip_match_opcode G_VECREDUCE_ADD):$root,
- [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]),
- (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }])
+ [{ return matchExtAddvToDotAddv(*${root}, MRI, STI, ${matchinfo}); }]),
+ (apply [{ applyExtAddvToDotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }])
>;
}
@@ -62,8 +62,10 @@ class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICom
def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
+def push_mul_through_zext : push_opcode_through_ext<G_MUL, G_ZEXT>;
def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;
+def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>;
def AArch64PreLegalizerCombiner: GICombiner<
"AArch64PreLegalizerCombinerImpl", [all_combines,
@@ -75,8 +77,10 @@ def AArch64PreLegalizerCombiner: GICombiner<
ext_uaddv_to_uaddlv,
push_sub_through_zext,
push_add_through_zext,
+ push_mul_through_zext,
push_sub_through_sext,
- push_add_through_sext]> {
+ push_add_through_sext,
+ push_mul_through_sext]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index eca7ca5..ad42f4b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ld1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(
Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO);
else if (Subtarget->hasSVE2p1())
@@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x2: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 0,
AArch64::LDNT1B_2Z_IMM_PSEUDO,
AArch64::LDNT1B_2Z_PSEUDO);
@@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 1,
AArch64::LDNT1H_2Z_IMM_PSEUDO,
AArch64::LDNT1H_2Z_PSEUDO);
@@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 2,
AArch64::LDNT1W_2Z_IMM_PSEUDO,
AArch64::LDNT1W_2Z_PSEUDO);
@@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 2, 3,
AArch64::LDNT1D_2Z_IMM_PSEUDO,
AArch64::LDNT1D_2Z_PSEUDO);
@@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
case Intrinsic::aarch64_sve_ldnt1_pn_x4: {
if (VT == MVT::nxv16i8) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 0,
AArch64::LDNT1B_4Z_IMM_PSEUDO,
AArch64::LDNT1B_4Z_PSEUDO);
@@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
VT == MVT::nxv8bf16) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 1,
AArch64::LDNT1H_4Z_IMM_PSEUDO,
AArch64::LDNT1H_4Z_PSEUDO);
@@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 2,
AArch64::LDNT1W_4Z_IMM_PSEUDO,
AArch64::LDNT1W_4Z_PSEUDO);
@@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
return;
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- if (Subtarget->hasSME2())
+ if (Subtarget->hasSME2() && Subtarget->isStreaming())
SelectContiguousMultiVectorLoad(Node, 4, 3,
AArch64::LDNT1D_4Z_IMM_PSEUDO,
AArch64::LDNT1D_4Z_PSEUDO);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b49754..4f6e3dd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8952,6 +8952,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID &CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
+ const CallBase *CB = CLI.CB;
MachineFunction &MF = DAG.getMachineFunction();
MachineFunction::CallSiteInfo CSInfo;
@@ -8991,6 +8992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
*DAG.getContext());
RetCCInfo.AnalyzeCallResult(Ins, RetCC);
+ // Set type id for call site info.
+ if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
+ CSInfo = MachineFunction::CallSiteInfo(*CB);
+
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
@@ -11325,7 +11330,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
SDValue AArch64TargetLowering::LowerSELECT_CC(
ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
- iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+ iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
const SDLoc &DL, SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -11386,6 +11391,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Canonicalise absolute difference patterns:
+ // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
+ // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
+ //
+ // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
+ // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
+ // The second forms can be matched into subs+cneg.
+ if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
+ if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
+ FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS)
+ FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
+ else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
+ FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS)
+ TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
+ }
+
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
@@ -11523,7 +11544,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return true;
}
})) {
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
SDValue VectorCmp =
emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
if (VectorCmp)
@@ -11537,7 +11558,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (Flags.hasNoSignedZeros()) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
@@ -11616,10 +11637,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
+ SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
- return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
- DAG);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11627,7 +11647,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
SDLoc DL(Op);
EVT Ty = Op.getValueType();
@@ -11694,8 +11713,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
DAG.getUNDEF(MVT::f32), FVal);
}
- SDValue Res =
- LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
+ SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
+ Op->getFlags(), DL, DAG);
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -12292,7 +12311,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
- SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
+ // Ensure nodes can be recognized by isAssociativeAndCommutative.
+ SDNodeFlags Flags =
+ SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16674,7 +16695,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ I->getFastMathFlags().allowContract()));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -24112,6 +24133,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result directly
+// without transferring the result into a GPR in the process.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ // Limit to post-legalization in order to avoid peeling truncating stores.
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+ // Source operand is already a vector.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType().isVector())
+ return SDValue();
+
+ // Look through potential assertions.
+ while (Value->isAssert())
+ Value = Value.getOperand(0);
+
+ if (Value.getOpcode() != ISD::FP_TO_SINT &&
+ Value.getOpcode() != ISD::FP_TO_UINT)
+ return SDValue();
+ if (!Value->hasOneUse())
+ return SDValue();
+
+ SDValue FPSrc = Value.getOperand(0);
+ EVT SrcVT = FPSrc.getValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // No support for assignments such as i64 = fp_to_sint i32
+ EVT VT = Value.getSimpleValueType();
+ if (VT != SrcVT.changeTypeToInteger())
+ return SDValue();
+
+ // Create a 128-bit element vector to avoid widening. The floating point
+ // conversion is transformed into a single element conversion via a pattern.
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+ EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+ EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+ SDLoc DL(ST);
+ SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+ SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+ DCI.CombineTo(ST->getValue().getNode(), Extracted);
+ return SDValue(ST, 0);
+}
+
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24194,6 +24269,9 @@ static SDValue performSTORECombine(SDNode *N,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(ST);
+ if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+ return Res;
+
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
EVT EltVT = VT.getVectorElementType();
return EltVT == MVT::f32 || EltVT == MVT::f64;
@@ -26926,6 +27004,23 @@ static SDValue performSHLCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
}
+static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntrinsicID = N->getConstantOperandVal(1);
+ auto Register =
+ (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
+ : AArch64SysReg::RNDRRS);
+ SDLoc DL(N);
+ SDValue A = DAG.getNode(
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
+ N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
+ SDValue B = DAG.getNode(
+ AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
+ return DAG.getMergeValues(
+ {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -27241,22 +27336,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_rndr:
- case Intrinsic::aarch64_rndrrs: {
- unsigned IntrinsicID = N->getConstantOperandVal(1);
- auto Register =
- (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
- : AArch64SysReg::RNDRRS);
- SDLoc DL(N);
- SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
- N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
- SDValue B = DAG.getNode(
- AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
- return DAG.getMergeValues(
- {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
- }
+ case Intrinsic::aarch64_rndrrs:
+ return performRNDRCombine(N, DAG);
case Intrinsic::aarch64_sme_ldr_zt:
return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
DAG.getVTList(MVT::Other), N->getOperand(0),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 95d0e3b..ea63edd8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -662,7 +662,7 @@ private:
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal,
iterator_range<SDNode::user_iterator> Users,
- bool HasNoNans, const SDLoc &dl,
+ SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8685d7a0..59d4fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6574,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
// the target options or if FADD/FSUB has the contract fast-math flag.
- return Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
Inst.getFlag(MachineInstr::FmContract);
- return true;
}
return false;
}
@@ -6680,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
case AArch64::FMUL_ZZZ_H:
case AArch64::FMUL_ZZZ_S:
case AArch64::FMUL_ZZZ_D:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
- (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
- Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// == Integer types ==
// -- Base instructions --
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07cacfa..251fd44 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+ (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+ (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index abcd550..b97d622 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -12,7 +12,7 @@
// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
-// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
template <typename T>
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+ // Strategy used to split logical immediate bitmasks.
+ enum class SplitStrategy {
+ Intersect,
+ };
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
+ bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
- if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
- return false;
-
- // If this immediate can be handled by one instruction, do not split it.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
- if (Insn.size() == 1)
- return false;
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
- unsigned OtherOpc) {
+bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy,
+ unsigned OtherOpc) {
// Try below transformation.
//
// MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
@@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
return splitTwoPartImm<T>(
MI,
- [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
- if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
+ // If this immediate is already a suitable bitmask, don't split it.
+ // TODO: Should we just combine the two instructions in this case?
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return std::nullopt;
+
+ // If this immediate can be handled by one instruction, don't split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return std::nullopt;
+
+ bool SplitSucc = false;
+ switch (Strategy) {
+ case SplitStrategy::Intersect:
+ SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ }
+ if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
@@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitINSERT(MI);
break;
case AArch64::ANDWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDSWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ Changed |= trySplitLogicalImm<uint32_t>(
+ AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
break;
case AArch64::ANDSXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ Changed |= trySplitLogicalImm<uint64_t>(
+ AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 40f49da..18ca22f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4905,14 +4905,17 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- // No need to unroll auto-vectorized loops
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
- return;
-
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining.
+ // inlining. Don't unroll auto-vectorized loops either, though do allow
+ // unrolling of the scalar remainder.
+ bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ // Both auto-vectorized loops and the scalar remainder have the
+ // isvectorized attribute, so differentiate between them by the presence
+ // of vector instructions.
+ if (IsVectorized && I.getType()->isVectorTy())
+ return;
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
index 0b79850..1a15075 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -50,8 +50,10 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub,
//
// %sub = G_SUB 0, %y
// %cmp = G_ICMP eq/ne, %z, %sub
+ // or with signed comparisons with the no-signed-wrap flag set
if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB ||
- !CmpInst::isEquality(Pred))
+ (!CmpInst::isEquality(Pred) &&
+ !(CmpInst::isSigned(Pred) && MaybeSub->getFlag(MachineInstr::NoSWrap))))
return false;
auto MaybeZero =
getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 1381a9b..d905692 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1810,7 +1810,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
// Couldn't optimize. Emit a compare + a Bcc.
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
- auto PredOp = ICmp.getOperand(1);
+ auto &PredOp = ICmp.getOperand(1);
emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
@@ -2506,12 +2506,12 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
return false;
}
auto &PredOp = Cmp->getOperand(1);
- auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
MIB.setInstrAndDebugLoc(I);
emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
/*RHS=*/Cmp->getOperand(3), PredOp, MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
+ const AArch64CC::CondCode InvCC =
+ changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
I.eraseFromParent();
return true;
@@ -3574,10 +3574,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return false;
}
- auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
+ auto &PredOp = I.getOperand(1);
+ emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
const AArch64CC::CondCode InvCC =
changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
- emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
/*Src2=*/AArch64::WZR, InvCC, MIB);
I.eraseFromParent();
@@ -5096,11 +5097,11 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
- auto Pred =
- static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
+ auto &PredOp = CondDef->getOperand(1);
+ emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp,
+ MIB);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
CondCode = changeICMPPredToAArch64CC(Pred);
- emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
- CondDef->getOperand(1), MIB);
} else {
// Get the condition code for the select.
auto Pred =
@@ -5148,29 +5149,37 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
+
// Given this:
//
// x = G_SUB 0, y
- // G_ICMP x, z
+ // G_ICMP z, x
//
// Produce this:
//
- // cmn y, z
- if (isCMN(LHSDef, P, MRI))
- return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
+ // cmn z, y
+ if (isCMN(RHSDef, P, MRI))
+ return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
- // Same idea here, but with the RHS of the compare instead:
+ // Same idea here, but with the LHS of the compare instead:
//
// Given this:
//
// x = G_SUB 0, y
- // G_ICMP z, x
+ // G_ICMP x, z
//
// Produce this:
//
- // cmn z, y
- if (isCMN(RHSDef, P, MRI))
- return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
+ // cmn y, z
+ //
+ // But be careful! We need to swap the predicate!
+ if (isCMN(LHSDef, P, MRI)) {
+ if (!CmpInst::isEquality(P)) {
+ P = CmpInst::getSwappedPredicate(P);
+ Predicate = MachineOperand::CreatePredicate(P);
+ }
+ return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
+ }
// Given this:
//
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index bb0f667b..e0e1af7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
};
+ auto LowerTriOp = [&MI, &MIB](unsigned Opcode) {
+ MIB.buildInstr(Opcode, {MI.getOperand(0)},
+ {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)});
+ MI.eraseFromParent();
+ return true;
+ };
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
@@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(TargetOpcode::G_USUBSAT);
break;
}
+ case Intrinsic::aarch64_neon_udot:
+ return LowerTriOp(AArch64::G_UDOT);
+ case Intrinsic::aarch64_neon_sdot:
+ return LowerTriOp(AArch64::G_SDOT);
case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 1cd9453..8c10673 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -228,12 +228,13 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}
-// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
-// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
+// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y))
+// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y))
+// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
-bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
- const AArch64Subtarget &STI,
- std::tuple<Register, Register, bool> &MatchInfo) {
+bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const AArch64Subtarget &STI,
+ std::tuple<Register, Register, bool> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
"Expected a G_VECREDUCE_ADD instruction");
assert(STI.hasDotProd() && "Target should have Dot Product feature");
@@ -246,31 +247,57 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
return false;
- LLT SrcTy;
- auto I1Opc = I1->getOpcode();
- if (I1Opc == TargetOpcode::G_MUL) {
+ // Detect mul(ext, ext) with symmetric ext's. If I1Opc is G_ZEXT or G_SEXT
+ // then the ext's must match the same opcode. It is set to the ext opcode on
+ // output.
+ auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1,
+ Register &Out2, unsigned &I1Opc) {
// If result of this has more than 1 use, then there is no point in creating
- // udot instruction
- if (!MRI.hasOneNonDBGUse(MidReg))
+ // a dot instruction
+ if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
MachineInstr *ExtMI1 =
- getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
+ getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
MachineInstr *ExtMI2 =
- getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
+ getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI);
LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());
if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
return false;
+ if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) &&
+ I1Opc != ExtMI1->getOpcode())
+ return false;
+ Out1 = ExtMI1->getOperand(1).getReg();
+ Out2 = ExtMI2->getOperand(1).getReg();
I1Opc = ExtMI1->getOpcode();
- SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
- std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
- std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
+ return true;
+ };
+
+ LLT SrcTy;
+ unsigned I1Opc = I1->getOpcode();
+ if (I1Opc == TargetOpcode::G_MUL) {
+ Register Out1, Out2;
+ if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc))
+ return false;
+ SrcTy = MRI.getType(Out1);
+ std::get<0>(MatchInfo) = Out1;
+ std::get<1>(MatchInfo) = Out2;
} else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) {
- SrcTy = MRI.getType(I1->getOperand(1).getReg());
- std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
- std::get<1>(MatchInfo) = 0;
+ Register I1Op = I1->getOperand(1).getReg();
+ MachineInstr *M = getDefIgnoringCopies(I1Op, MRI);
+ Register Out1, Out2;
+ if (M->getOpcode() == TargetOpcode::G_MUL &&
+ tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) {
+ SrcTy = MRI.getType(Out1);
+ std::get<0>(MatchInfo) = Out1;
+ std::get<1>(MatchInfo) = Out2;
+ } else {
+ SrcTy = MRI.getType(I1Op);
+ std::get<0>(MatchInfo) = I1Op;
+ std::get<1>(MatchInfo) = 0;
+ }
} else {
return false;
}
@@ -288,11 +315,11 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
return true;
}
-void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &Builder,
- GISelChangeObserver &Observer,
- const AArch64Subtarget &STI,
- std::tuple<Register, Register, bool> &MatchInfo) {
+void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder,
+ GISelChangeObserver &Observer,
+ const AArch64Subtarget &STI,
+ std::tuple<Register, Register, bool> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
"Expected a G_VECREDUCE_ADD instruction");
assert(STI.hasDotProd() && "Target should have Dot Product feature");
@@ -553,15 +580,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
-// Pushes ADD/SUB through extend instructions to decrease the number of extend
-// instruction at the end by allowing selection of {s|u}addl sooner
-
+// Pushes ADD/SUB/MUL through extend instructions to decrease the number of
+// extend instruction at the end by allowing selection of {s|u}addl sooner
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
Register DstReg, Register SrcReg1, Register SrcReg2) {
assert((MI.getOpcode() == TargetOpcode::G_ADD ||
- MI.getOpcode() == TargetOpcode::G_SUB) &&
- "Expected a G_ADD or G_SUB instruction\n");
+ MI.getOpcode() == TargetOpcode::G_SUB ||
+ MI.getOpcode() == TargetOpcode::G_MUL) &&
+ "Expected a G_ADD, G_SUB or G_MUL instruction\n");
// Deal with vector types only
LLT DstTy = MRI.getType(DstReg);
@@ -594,9 +621,10 @@ void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
// G_SUB has to sign-extend the result.
- // G_ADD needs to sext from sext and can sext or zext from zext, so the
- // original opcode is used.
- if (MI.getOpcode() == TargetOpcode::G_ADD)
+ // G_ADD needs to sext from sext and can sext or zext from zext, and G_MUL
+ // needs to use the original opcode so the original opcode is used for both.
+ if (MI.getOpcode() == TargetOpcode::G_ADD ||
+ MI.getOpcode() == TargetOpcode::G_MUL)
B.buildInstr(Opc, {DstReg}, {AddReg});
else
B.buildSExt(DstReg, AddReg);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1ac340a..a22a17a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
// But only if they don't point to a few forbidden sections.
if (!Symbol.isInSection())
return true;
- const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ const MCSectionMachO &RefSec =
+ static_cast<MCSectionMachO &>(Symbol.getSection());
if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
return false;