From ca423a26e7bfc31a36c9ad790b0ae1bb9be18836 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 19 Jun 2024 14:25:57 +0100 Subject: [AArch64] Avoid using NEON BSL for streaming[-compatible] functions (#95803) --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 +- .../sve-streaming-mode-fixed-length-bitselect.ll | 99 ++++++++++++++-------- 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0f0606c..c790209 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18244,9 +18244,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, if (!VT.isVector()) return SDValue(); - // The combining code works for NEON, SVE2 and SME. - if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) || - (VT.isScalableVector() && !Subtarget.hasSVE2())) + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) return SDValue(); SDValue N0 = N->getOperand(0); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index b908dd6..d65e87d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -34,39 +34,72 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x2] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #60] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #56] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #52] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #48] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #44] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #40] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #36] -; NONEON-NOSVE-NEXT: ldr w8, [sp] -; NONEON-NOSVE-NEXT: neg w8, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #32] -; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: add sp, sp, #64 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x2] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w8, w14, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w9, w4, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w13, w11, [sp, #56] +; NONEON-NOSVE-NEXT: neg w3, w8 +; NONEON-NOSVE-NEXT: neg w15, w14 +; NONEON-NOSVE-NEXT: str q4, [sp, #32] +; NONEON-NOSVE-NEXT: and w9, w3, w9 +; NONEON-NOSVE-NEXT: and w15, w15, w4 +; NONEON-NOSVE-NEXT: str q5, [sp, #80] +; NONEON-NOSVE-NEXT: ldp w5, w3, [sp, #72] +; NONEON-NOSVE-NEXT: ldp w16, w12, [sp] +; NONEON-NOSVE-NEXT: neg w4, w11 +; NONEON-NOSVE-NEXT: neg w2, w13 +; NONEON-NOSVE-NEXT: sub w11, w11, #1 +; NONEON-NOSVE-NEXT: and w3, w4, w3 +; NONEON-NOSVE-NEXT: and w2, w2, w5 +; NONEON-NOSVE-NEXT: sub w13, w13, #1 +; NONEON-NOSVE-NEXT: ldp w6, w4, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w17, [sp, #8] +; NONEON-NOSVE-NEXT: neg w1, w16 +; NONEON-NOSVE-NEXT: neg w0, w12 +; NONEON-NOSVE-NEXT: sub w16, w16, #1 +; NONEON-NOSVE-NEXT: and w1, w1, w6 +; NONEON-NOSVE-NEXT: and w0, w0, w4 +; NONEON-NOSVE-NEXT: sub w12, w12, #1 +; NONEON-NOSVE-NEXT: ldp w5, w6, [sp, #24] +; NONEON-NOSVE-NEXT: neg w18, w17 +; NONEON-NOSVE-NEXT: neg w4, w10 +; NONEON-NOSVE-NEXT: sub w17, w17, #1 +; NONEON-NOSVE-NEXT: sub w10, w10, #1 +; NONEON-NOSVE-NEXT: sub w14, w14, #1 +; NONEON-NOSVE-NEXT: sub w8, w8, #1 +; NONEON-NOSVE-NEXT: and w4, w4, w5 +; NONEON-NOSVE-NEXT: and w18, w18, w6 +; NONEON-NOSVE-NEXT: ldp w5, w6, [sp, #32] +; NONEON-NOSVE-NEXT: and w16, w16, w5 +; NONEON-NOSVE-NEXT: and w12, w12, w6 +; NONEON-NOSVE-NEXT: ldp w5, w6, [sp, #40] +; NONEON-NOSVE-NEXT: and w10, w10, w5 +; NONEON-NOSVE-NEXT: and w17, w17, w6 +; NONEON-NOSVE-NEXT: orr w17, w17, w18 +; NONEON-NOSVE-NEXT: orr w10, w10, w4 +; NONEON-NOSVE-NEXT: ldp w18, w4, [sp, #88] +; NONEON-NOSVE-NEXT: ldp w5, w6, [sp, #80] +; NONEON-NOSVE-NEXT: stp w10, w17, [sp, #104] +; NONEON-NOSVE-NEXT: orr w10, w12, w0 +; NONEON-NOSVE-NEXT: orr w12, w16, w1 +; NONEON-NOSVE-NEXT: and w11, w11, w4 +; NONEON-NOSVE-NEXT: stp w12, w10, [sp, #96] +; NONEON-NOSVE-NEXT: and w10, w13, w18 +; NONEON-NOSVE-NEXT: orr w11, w11, w3 +; NONEON-NOSVE-NEXT: and w12, w14, w6 +; NONEON-NOSVE-NEXT: orr w10, w10, w2 +; NONEON-NOSVE-NEXT: and w8, w8, w5 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #120] +; NONEON-NOSVE-NEXT: orr w10, w12, w15 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr -- cgit v1.1