aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLukacma <Marian.Lukac@arm.com>2024-06-25 10:58:16 +0200
committerGitHub <noreply@github.com>2024-06-25 10:58:16 +0200
commit0bd9c49a295829ed44e838c4d54cc905662a1afa (patch)
tree05899f7b4a0d4f9ebbeb96249069f369e26dc90f
parentf6aa50873463ebd9a459b7ccd4989460175a6e7f (diff)
downloadllvm-0bd9c49a295829ed44e838c4d54cc905662a1afa.zip
llvm-0bd9c49a295829ed44e838c4d54cc905662a1afa.tar.gz
llvm-0bd9c49a295829ed44e838c4d54cc905662a1afa.tar.bz2
[AArch64][SVE] optimisation for SVE load intrinsics with no active lanes (#95269)
This patch extends #73964 and adds an optimisation for SVE load intrinsics whose predicate is all-zero (no active lanes).
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp70
-rw-r--r--llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll395
2 files changed, 465 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 632cb23..0f5d80a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Simplify an operation whose predicate (operand 0) has all lanes inactive by
+// replacing the instruction with an all-zero value of its result type.
+//
+// Returns the result of erasing II when the fold applies, or std::nullopt when
+// operand 0 does not match a zero constant (II is then left untouched).
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
+  if (!match(II.getOperand(0), m_ZeroInt()))
+    return std::nullopt;
+
+  // Constant::getNullValue builds the zero constant for integer,
+  // floating-point, vector and struct result types alike, so struct-returning
+  // intrinsics need no special casing and their members are not assumed to
+  // share the type of element 0.
+  IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+  // The call itself is erased here too, once its uses have been rewritten.
+  return IC.eraseInstFromFunction(II);
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *PtrOp = II.getOperand(1);
Type *VecTy = II.getType();
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Load->copyMetadata(II);
@@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Type *Ty = II.getType();
Value *PassThru = ConstantAggregateZero::get(Ty);
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
// Contiguous gather => masked load.
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
@@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_ld1_gather:
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
+ case Intrinsic::aarch64_sve_ld1ro:
+ case Intrinsic::aarch64_sve_ld1rq:
+ case Intrinsic::aarch64_sve_ld1udq:
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_ld2_sret:
+ case Intrinsic::aarch64_sve_ld2q_sret:
+ case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
+ case Intrinsic::aarch64_sve_ld4_sret:
+ case Intrinsic::aarch64_sve_ld4q_sret:
+ case Intrinsic::aarch64_sve_ldff1:
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ldnf1:
+ case Intrinsic::aarch64_sve_ldnt1:
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return instCombineSVENoActiveUnaryZero(IC, II);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
new file mode 100644
index 0000000..2470337
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @test_ld1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 4 x i32> @test_ld1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ld1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+
+define <vscale x 2 x i64> @test_ld1q_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1q_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %a, i64 0)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_ld1ro(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1ro(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @test_ld1rq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1rq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 2 x i64> @test_ld1udq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1udq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i32> @test_ld1uwq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1uwq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 8 x i16> @test_ld2_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld2q_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2q_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ldff1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldff1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldff1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldff1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 16 x i8> @test_ldnf1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ldnf1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_ldnt1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldnt1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldnt1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldnt1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}