author    Paul Walker <paul.walker@arm.com>  2024-05-01 11:10:47 +0100
committer GitHub <noreply@github.com>        2024-05-01 11:10:47 +0100
commit    fdf206c10ccea9dd5bd3d7eeb1381f305f972e52 (patch)
tree      8cd0d936b4e97533352efaa4ffbe75eab79fa22a
parent    9bebf25ecbe6a8720dd581bd2a4f8d29aa763a42 (diff)
[LLVM][SVE] Improve legalisation of fixed length get.active.lane.mask (#90213)
We are effectively performing type and operation legalisation very early within the code generation flow. This results in worse code quality because the DAG is not in canonical form, which DAGCombiner then corrects by introducing operations that are not legal. This patch splits the code and moves it to where type and operation legalisation is typically implemented.
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  80
-rw-r--r--  llvm/test/CodeGen/AArch64/active_lane_mask.ll    36
2 files changed, 61 insertions, 55 deletions
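
For reference, the intrinsic being legalised produces a per-lane predicate from a scalar index and trip count. Below is a minimal scalar sketch of that semantics; it is an editorial illustration, not code from this commit, and the function name, template shape, and use of std::array are assumptions (it also ignores the no-wrap behaviour of the index + lane addition).

// Editor's sketch (not part of this commit): a scalar model of the
// fixed-length llvm.get.active.lane.mask semantics that the new lowering
// targets, assuming the usual "lane I is active while Index + I < TripCount"
// unsigned comparison (overflow of Index + I is ignored here).
#include <array>
#include <cstdint>

template <unsigned N>
std::array<bool, N> activeLaneMask(uint64_t Index, uint64_t TripCount) {
  std::array<bool, N> Mask{};
  for (unsigned I = 0; I < N; ++I)
    Mask[I] = (Index + I) < TripCount; // mirrors whilelo's element-wise test
  return Mask;
}

// e.g. activeLaneMask<8>(6, 10) yields {1,1,1,1,0,0,0,0}, the <8 x i1> mask.
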
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cb7930f..2af679e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1677,6 +1677,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
+ for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
}
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -5748,8 +5751,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::get_active_lane_mask: {
SDValue ID =
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
- Op.getOperand(1), Op.getOperand(2));
+
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
+ Op.getOperand(2));
+
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
+
+ SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
+ Op.getOperand(1), Op.getOperand(2));
+ SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
+ DAG.getVectorIdxConstant(0, dl));
}
case Intrinsic::aarch64_neon_uaddlv: {
EVT OpVT = Op.getOperand(1).getValueType();
@@ -20530,39 +20549,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
- case Intrinsic::get_active_lane_mask: {
- SDValue Res = SDValue();
- EVT VT = N->getValueType(0);
- if (VT.isFixedLengthVector()) {
- // We can use the SVE whilelo instruction to lower this intrinsic by
- // creating the appropriate sequence of scalable vector operations and
- // then extracting a fixed-width subvector from the scalable vector.
-
- SDLoc DL(N);
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
-
- EVT WhileVT = EVT::getVectorVT(
- *DAG.getContext(), MVT::i1,
- ElementCount::getScalable(VT.getVectorNumElements()));
-
- // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
- EVT PromVT = getPromotedVTForPredicate(WhileVT);
-
- // Get the fixed-width equivalent of PromVT for extraction.
- EVT ExtVT =
- EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
- VT.getVectorElementCount());
-
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
- N->getOperand(1), N->getOperand(2));
- Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
- DAG.getConstant(0, DL, MVT::i64));
- Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- }
- return Res;
- }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -25636,8 +25622,6 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
Intrinsic::ID IntID =
static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
@@ -25645,6 +25629,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
default:
return;
case Intrinsic::aarch64_sve_clasta_n: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
@@ -25653,6 +25639,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_clastb_n: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
@@ -25661,6 +25649,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_lasta: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
@@ -25668,12 +25658,28 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_lastb: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
+ case Intrinsic::get_active_lane_mask: {
+ if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
+ return;
+
+ // NOTE: Only trivial type promotion is supported.
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
+ return;
+
+ SDLoc DL(N);
+ auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
}
}
case ISD::READ_REGISTER: {
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 43122c8..bd5d076 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -353,9 +353,9 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.h, w0, w1
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: whilelo p0.b, w0, w1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
ret <8 x i1> %active.lane.mask
@@ -364,9 +364,9 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.s, w0, w1
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: whilelo p0.h, w0, w1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
ret <4 x i1> %active.lane.mask
@@ -375,9 +375,9 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.d, w0, w1
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: whilelo p0.s, w0, w1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
ret <2 x i1> %active.lane.mask
@@ -397,9 +397,9 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.h, x0, x1
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: whilelo p0.b, x0, x1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
ret <8 x i1> %active.lane.mask
@@ -408,9 +408,9 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.s, x0, x1
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: whilelo p0.h, x0, x1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
ret <4 x i1> %active.lane.mask
@@ -419,9 +419,9 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.d, x0, x1
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: whilelo p0.s, x0, x1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)
ret <2 x i1> %active.lane.mask