author     Lukacma <Marian.Lukac@arm.com>    2024-06-25 10:58:16 +0200
committer  GitHub <noreply@github.com>       2024-06-25 10:58:16 +0200
commit     0bd9c49a295829ed44e838c4d54cc905662a1afa
tree       05899f7b4a0d4f9ebbeb96249069f369e26dc90f
parent     f6aa50873463ebd9a459b7ccd4989460175a6e7f
[AArch64][SVE] optimisation for SVE load intrinsics with no active lanes (#95269)
This patch extends #73964 and adds an optimisation for SVE load intrinsics
when the predicate has no active lanes.
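
When the governing predicate is all inactive, the load cannot access any memory and every result lane is known to be zero, so InstCombine can replace the whole call with a zero constant. A minimal before/after sketch in LLVM IR (it mirrors the test_ld1 case added in this patch):

    ; before: the predicate has no active lanes
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
    ret <vscale x 16 x i8> %res

    ; after -passes=instcombine: the call is erased and its uses replaced
    ret <vscale x 16 x i8> zeroinitializer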
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp                               |  70
-rw-r--r--  llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll | 395
2 files changed, 465 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 632cb23..0f5d80a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) {
                          m_ConstantInt<AArch64SVEPredPattern::all>()));
 }
 
+// Simplify unary operation where predicate has all inactive lanes by replacing
+// instruction with zeroed object
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
+  if (match(II.getOperand(0), m_ZeroInt())) {
+    Constant *Node;
+    Type *RetTy = II.getType();
+    if (RetTy->isStructTy()) {
+      auto StructT = cast<StructType>(RetTy);
+      auto VecT = StructT->getElementType(0);
+      SmallVector<llvm::Constant *, 4> ZerVec;
+      for (unsigned i = 0; i < StructT->getNumElements(); i++) {
+        ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
+                                                  : ConstantInt::get(VecT, 0));
+      }
+      Node = ConstantStruct::get(StructT, ZerVec);
+    } else if (RetTy->isFPOrFPVectorTy())
+      Node = ConstantFP::get(RetTy, 0.0);
+    else
+      Node = ConstantInt::get(II.getType(), 0);
+
+    IC.replaceInstUsesWith(II, Node);
+    return IC.eraseInstFromFunction(II);
+  }
+  return std::nullopt;
+}
+
 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                       IntrinsicInst &II) {
   // svsel(ptrue, x, y) => x
@@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
   Value *PtrOp = II.getOperand(1);
   Type *VecTy = II.getType();
 
+  // Replace by zero constant when all lanes are inactive
+  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+    return II_NA;
+
   if (isAllActivePredicate(Pred)) {
     LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
     Load->copyMetadata(II);
@@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
   Type *Ty = II.getType();
   Value *PassThru = ConstantAggregateZero::get(Ty);
 
+  // Replace by zero constant when all lanes are inactive
+  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+    return II_NA;
+
   // Contiguous gather => masked load.
   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
@@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   switch (IID) {
   default:
     break;
+
+  case Intrinsic::aarch64_sve_ld1_gather:
+  case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+  case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+  case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+  case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+  case Intrinsic::aarch64_sve_ld1q_gather_index:
+  case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
+  case Intrinsic::aarch64_sve_ld1ro:
+  case Intrinsic::aarch64_sve_ld1rq:
+  case Intrinsic::aarch64_sve_ld1udq:
+  case Intrinsic::aarch64_sve_ld1uwq:
+  case Intrinsic::aarch64_sve_ld2_sret:
+  case Intrinsic::aarch64_sve_ld2q_sret:
+  case Intrinsic::aarch64_sve_ld3_sret:
+  case Intrinsic::aarch64_sve_ld3q_sret:
+  case Intrinsic::aarch64_sve_ld4_sret:
+  case Intrinsic::aarch64_sve_ld4q_sret:
+  case Intrinsic::aarch64_sve_ldff1:
+  case Intrinsic::aarch64_sve_ldff1_gather:
+  case Intrinsic::aarch64_sve_ldff1_gather_index:
+  case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+  case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+  case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+  case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+  case Intrinsic::aarch64_sve_ldnf1:
+  case Intrinsic::aarch64_sve_ldnt1:
+  case Intrinsic::aarch64_sve_ldnt1_gather:
+  case Intrinsic::aarch64_sve_ldnt1_gather_index:
+  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+    return instCombineSVENoActiveUnaryZero(IC, II);
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
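
Note the struct-returning path in the helper above: for the multi-vector ldN intrinsics the zeroed replacement is a ConstantStruct of zero vectors, so any extractvalue of the result folds away as well. A short sketch of the effect in LLVM IR (it mirrors the test_ld2_sret case in the new test file below):

    ; before: an ld2 whose predicate has no active lanes
    %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
    %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0

    ; after -passes=instcombine
    ret <vscale x 8 x i16> zeroinitializer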
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
new file mode 100644
index 0000000..2470337
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @test_ld1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 4 x i32> @test_ld1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ld1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+
+define <vscale x 2 x i64> @test_ld1q_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1q_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %a, i64 0)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_ld1ro(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1ro(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @test_ld1rq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1rq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 2 x i64> @test_ld1udq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1udq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i32> @test_ld1uwq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1uwq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 8 x i16> @test_ld2_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld2q_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2q_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ldff1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldff1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldff1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldff1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 16 x i8> @test_ldnf1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ldnf1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_ldnt1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldnt1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldnt1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldnt1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %0
+}