; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTV

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB0_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Masked variant of @gather. The mask constant below is exactly the bitmap
; 0xF02D5369 that the checks materialize via lui 983765 (0xF02D5) + addi 873
; (0x369) into v0.
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: gather_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB1_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vlse8.v v9, (a1), a4, v0.t
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vse8.v v9, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB1_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Same as @gather but walking B backwards: the induction vector starts at
; <31, ..., 0>, so the base is biased by 31 * 5 = 155 and the stride is -5.
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 155
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, -5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB2_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB2_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; All lanes read the same address, so the gather folds to a scalar load +
; vadd.vx without the optimized-zero-stride-load feature.
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:  .LBB3_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB3_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; i32-element version of the zero-stride gather.
define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB4_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lw a3, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB4_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; The gathered value feeds a udiv (not a vx-foldable op), so without
; optimized-zero-stride-load the splat is materialized via vmv.v.x; with it,
; a zero-strided vlse8 is used directly.
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V:       # %bb.0: # %entry
; V-NEXT:    addi a2, a0, 1024
; V-NEXT:    li a3, 32
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT:  .LBB5_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    lbu a3, 0(a1)
; V-NEXT:    vle8.v v8, (a0)
; V-NEXT:    vmv.v.x v9, a3
; V-NEXT:    vdivu.vv v8, v9, v8
; V-NEXT:    vse8.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB5_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    addi a2, a0, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT:  .LBB5_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    lbu a3, 0(a1)
; ZVE32F-NEXT:    vle8.v v8, (a0)
; ZVE32F-NEXT:    vmv.v.x v9, a3
; ZVE32F-NEXT:    vdivu.vv v8, v9, v8
; ZVE32F-NEXT:    vse8.v v8, (a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bne a0, a2, .LBB5_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
;
; OPTIMIZED-LABEL: gather_zero_stride_unfold:
; OPTIMIZED:       # %bb.0: # %entry
; OPTIMIZED-NEXT:    addi a2, a0, 1024
; OPTIMIZED-NEXT:    li a3, 32
; OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; OPTIMIZED-NEXT:  .LBB5_1: # %vector.body
; OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTIMIZED-NEXT:    vlse8.v v8, (a1), zero
; OPTIMIZED-NEXT:    vle8.v v9, (a0)
; OPTIMIZED-NEXT:    vdivu.vv v8, v8, v9
; OPTIMIZED-NEXT:    vse8.v v8, (a0)
; OPTIMIZED-NEXT:    addi a0, a0, 32
; OPTIMIZED-NEXT:    addi a1, a1, 160
; OPTIMIZED-NEXT:    bne a0, a2, .LBB5_1
; OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
; OPTIMIZED-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB6_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vlse8.v v9, (a0), a3
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse8.v v8, (a0), a3
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB6_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Masked variant of @scatter; the same 0xF02D5369 mask bitmap guards both the
; strided load and the strided store.
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: scatter_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB7_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vmv1r.v v10, v8
; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB7_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    add a3, a0, a3
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a1), a2
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 128
; CHECK-NEXT:    bne a0, a3, .LBB8_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 32
; CHECK-NEXT:    add a3, a1, a3
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bne a1, a3, .LBB9_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 132
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    addi a4, a0, 32
; CHECK-NEXT:    addi a5, a1, -128
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    vlse32.v v10, (a5), a3
; CHECK-NEXT:    vle32.v v11, (a4)
; CHECK-NEXT:    vadd.vv v9, v9, v10
; CHECK-NEXT:    vadd.vv v8, v11, v8
; CHECK-NEXT:    vse32.v v9, (a0)
; CHECK-NEXT:    vse32.v v8, (a4)
; CHECK-NEXT:    addi a0, a0, 64
; CHECK-NEXT:    addi a1, a1, 256
; CHECK-NEXT:    bne a0, a2, .LBB10_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 256
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a5, a1, 16
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 4
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 32
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 8
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 48
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 12
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 512
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB11_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a0, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB12_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vlse64.v v8, (a1), a3
; V-NEXT:    addi a4, a1, 80
; V-NEXT:    vlse64.v v9, (a4), a3
; V-NEXT:    addi a4, a0, 16
; V-NEXT:    vse64.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    vse64.v v9, (a4)
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB12_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a3, 2
; ZVE32F-NEXT:    add a3, a0, a3
; ZVE32F-NEXT:    li a4, 1
; ZVE32F-NEXT:  .LBB12_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    slli a5, a4, 3
; ZVE32F-NEXT:    slli a6, a4, 5
; ZVE32F-NEXT:    slli a7, a2, 3
; ZVE32F-NEXT:    slli t0, a2, 5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    add a5, a6, a5
; ZVE32F-NEXT:    add a7, t0, a7
; ZVE32F-NEXT:    add a5, a1, a5
; ZVE32F-NEXT:    add a7, a1, a7
; ZVE32F-NEXT:    ld a6, 0(a7)
; ZVE32F-NEXT:    ld t0, 0(a5)
; ZVE32F-NEXT:    ld a7, 80(a7)
; ZVE32F-NEXT:    ld a5, 80(a5)
; ZVE32F-NEXT:    sd a6, 0(a0)
; ZVE32F-NEXT:    sd t0, 8(a0)
; ZVE32F-NEXT:    sd a7, 16(a0)
; ZVE32F-NEXT:    sd a5, 24(a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a4, a4, 4
; ZVE32F-NEXT:    bne a0, a3, .LBB12_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTZVE32F-LABEL: gather_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    lui a2, 2
; OPTZVE32F-NEXT:    add a2, a0, a2
; OPTZVE32F-NEXT:    li a3, 40
; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTZVE32F-NEXT:  .LBB12_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    vlse64.v v8, (a1), a3
; OPTZVE32F-NEXT:    addi a4, a1, 80
; OPTZVE32F-NEXT:    vlse64.v v9, (a4), a3
; OPTZVE32F-NEXT:    addi a4, a0, 16
; OPTZVE32F-NEXT:    vse64.v v8, (a0)
; OPTZVE32F-NEXT:    addi a0, a0, 32
; OPTZVE32F-NEXT:    vse64.v v9, (a4)
; OPTZVE32F-NEXT:    addi a1, a1, 160
; OPTZVE32F-NEXT:    bne a0, a2, .LBB12_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
;
; OPTV-LABEL: gather_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    li a2, 0
; OPTV-NEXT:    lui a3, 2
; OPTV-NEXT:    add a3, a0, a3
; OPTV-NEXT:    li a4, 1
; OPTV-NEXT:  .LBB12_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    slli a5, a4, 3
; OPTV-NEXT:    slli a6, a4, 5
; OPTV-NEXT:    slli a7, a2, 3
; OPTV-NEXT:    slli t0, a2, 5
; OPTV-NEXT:    addi a2, a2, 4
; OPTV-NEXT:    add a5, a6, a5
; OPTV-NEXT:    add a7, t0, a7
; OPTV-NEXT:    add a5, a1, a5
; OPTV-NEXT:    add a7, a1, a7
; OPTV-NEXT:    ld a6, 0(a7)
; OPTV-NEXT:    ld t0, 0(a5)
; OPTV-NEXT:    ld a7, 80(a7)
; OPTV-NEXT:    ld a5, 80(a5)
; OPTV-NEXT:    sd a6, 0(a0)
; OPTV-NEXT:    sd t0, 8(a0)
; OPTV-NEXT:    sd a7, 16(a0)
; OPTV-NEXT:    sd a5, 24(a0)
; OPTV-NEXT:    addi a0, a0, 32
; OPTV-NEXT:    addi a4, a4, 4
; OPTV-NEXT:    bne a0, a3, .LBB12_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> poison)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> poison)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare <2 x ptr>
@llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
; NOTE(review): missing vector constants (<i64 0, i64 1>, <i64 10, i64 10>,
; <i64 4, i64 4>) reconstructed from the CHECK lines (sd ..., 80(t1/t2) and
; addi a2, a2, 4).
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a1, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB13_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 16
; V-NEXT:    vle64.v v8, (a1)
; V-NEXT:    vle64.v v9, (a4)
; V-NEXT:    addi a4, a0, 80
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    vsse64.v v8, (a0), a3
; V-NEXT:    vsse64.v v9, (a4), a3
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bne a1, a2, .LBB13_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a3, 2
; ZVE32F-NEXT:    add a3, a1, a3
; ZVE32F-NEXT:    li a4, 1
; ZVE32F-NEXT:  .LBB13_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    ld a5, 0(a1)
; ZVE32F-NEXT:    ld a6, 8(a1)
; ZVE32F-NEXT:    ld a7, 16(a1)
; ZVE32F-NEXT:    ld t0, 24(a1)
; ZVE32F-NEXT:    slli t1, a4, 3
; ZVE32F-NEXT:    slli t2, a4, 5
; ZVE32F-NEXT:    slli t3, a2, 3
; ZVE32F-NEXT:    add t1, t2, t1
; ZVE32F-NEXT:    slli t2, a2, 5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    add t2, t2, t3
; ZVE32F-NEXT:    add t1, a0, t1
; ZVE32F-NEXT:    add t2, a0, t2
; ZVE32F-NEXT:    sd a5, 0(t2)
; ZVE32F-NEXT:    sd a6, 0(t1)
; ZVE32F-NEXT:    sd a7, 80(t2)
; ZVE32F-NEXT:    sd t0, 80(t1)
; ZVE32F-NEXT:    addi a4, a4, 4
; ZVE32F-NEXT:    bne a1, a3, .LBB13_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTZVE32F-LABEL: scatter_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    lui a2, 2
; OPTZVE32F-NEXT:    add a2, a1, a2
; OPTZVE32F-NEXT:    li a3, 40
; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTZVE32F-NEXT:  .LBB13_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    addi a4, a1, 16
; OPTZVE32F-NEXT:    vle64.v v8, (a1)
; OPTZVE32F-NEXT:    vle64.v v9, (a4)
; OPTZVE32F-NEXT:    addi a4, a0, 80
; OPTZVE32F-NEXT:    addi a1, a1, 32
; OPTZVE32F-NEXT:    vsse64.v v8, (a0), a3
; OPTZVE32F-NEXT:    vsse64.v v9, (a4), a3
; OPTZVE32F-NEXT:    addi a0, a0, 160
; OPTZVE32F-NEXT:    bne a1, a2, .LBB13_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
;
; OPTV-LABEL: scatter_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    li a2, 0
; OPTV-NEXT:    lui a3, 2
; OPTV-NEXT:    add a3, a1, a3
; OPTV-NEXT:    li a4, 1
; OPTV-NEXT:  .LBB13_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    ld a5, 0(a1)
; OPTV-NEXT:    ld a6, 8(a1)
; OPTV-NEXT:    ld a7, 16(a1)
; OPTV-NEXT:    ld t0, 24(a1)
; OPTV-NEXT:    slli t1, a4, 3
; OPTV-NEXT:    slli t2, a4, 5
; OPTV-NEXT:    slli t3, a2, 3
; OPTV-NEXT:    add t1, t2, t1
; OPTV-NEXT:    slli t2, a2, 5
; OPTV-NEXT:    addi a2, a2, 4
; OPTV-NEXT:    addi a1, a1, 32
; OPTV-NEXT:    add t2, t2, t3
; OPTV-NEXT:    add t1, a0, t1
; OPTV-NEXT:    add t2, a0, t2
; OPTV-NEXT:    sd a5, 0(t2)
; OPTV-NEXT:    sd a6, 0(t1)
; OPTV-NEXT:    sd a7, 80(t2)
; OPTV-NEXT:    sd t0, 80(t1)
; OPTV-NEXT:    addi a4, a4, 4
; OPTV-NEXT:    bne a1, a3, .LBB13_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
%i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)

; Strided gather where the induction variable starts at a runtime value
; (%arg2) rather than zero; the vector loop handles the 32-wide chunks and a
; scalar epilogue (bb35) handles the remainder.
; NOTE(review): the step-vector constant added to the broadcast start value
; (%i14) was lost in extraction and is reconstructed as <i64 0 .. i64 31>,
; consistent with the 32-lane loop and splat (i64 32) increment.
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    li a3, 1024
; CHECK-NEXT:    beq a2, a3, .LBB14_7
; CHECK-NEXT:  # %bb.1: # %bb3
; CHECK-NEXT:    li a3, 1023
; CHECK-NEXT:    subw a5, a3, a2
; CHECK-NEXT:    li a6, 31
; CHECK-NEXT:    mv a4, a2
; CHECK-NEXT:    bltu a5, a6, .LBB14_5
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    slli a4, a5, 32
; CHECK-NEXT:    slli t0, a2, 2
; CHECK-NEXT:    add a5, a0, a2
; CHECK-NEXT:    add a6, a1, a2
; CHECK-NEXT:    li t2, 32
; CHECK-NEXT:    srli a4, a4, 32
; CHECK-NEXT:    add t0, a6, t0
; CHECK-NEXT:    addi a6, a4, 1
; CHECK-NEXT:    andi a7, a6, -32
; CHECK-NEXT:    add a4, a7, a2
; CHECK-NEXT:    add a2, a0, a4
; CHECK-NEXT:    li t1, 5
; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT:  .LBB14_3: # %bb15
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (t0), t1
; CHECK-NEXT:    vle8.v v9, (a5)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a5)
; CHECK-NEXT:    addi a5, a5, 32
; CHECK-NEXT:    addi t0, t0, 160
; CHECK-NEXT:    bne a5, a2, .LBB14_3
; CHECK-NEXT:  # %bb.4: # %bb30
; CHECK-NEXT:    beq a6, a7, .LBB14_7
; CHECK-NEXT:  .LBB14_5: # %bb32
; CHECK-NEXT:    add a2, a0, a4
; CHECK-NEXT:    slli a5, a4, 2
; CHECK-NEXT:    add a1, a1, a4
; CHECK-NEXT:    sub a3, a3, a4
; CHECK-NEXT:    add a1, a1, a5
; CHECK-NEXT:    slli a3, a3, 32
; CHECK-NEXT:    srli a3, a3, 32
; CHECK-NEXT:    add a0, a0, a4
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    addi a0, a0, 1
; CHECK-NEXT:  .LBB14_6: # %bb35
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    lbu a4, 0(a2)
; CHECK-NEXT:    add a3, a4, a3
; CHECK-NEXT:    sb a3, 0(a2)
; CHECK-NEXT:    addi a2, a2, 1
; CHECK-NEXT:    addi a1, a1, 5
; CHECK-NEXT:    bne a2, a0, .LBB14_6
; CHECK-NEXT:  .LBB14_7: # %bb34
; CHECK-NEXT:    ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> poison)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8>
@llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)

; Trip count is a multiple of the 16-lane VF, so no scalar remainder loop is
; generated; the gather lowers to a single strided vlse8.v per iteration.
; NOTE(review): the initial step-vector phi constant was lost in extraction
; and is reconstructed as <i64 0 .. i64 15>, consistent with the 16-lane loop
; and splat (i64 16) increment.
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    slli a2, a2, 4
; CHECK-NEXT:    beqz a2, .LBB15_3
; CHECK-NEXT:  # %bb.1: # %bb2
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT:  .LBB15_2: # %bb4
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    addi a1, a1, 80
; CHECK-NEXT:    bne a0, a2, .LBB15_2
; CHECK-NEXT:  .LBB15_3: # %bb16
; CHECK-NEXT:    ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> poison)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}

; Zero-stride FP gather: the index vector is all zeroes, so the gather becomes
; a scalar flw + vfadd.vf splat in the lowered code.
define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_fp:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB16_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flw fa5, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfadd.vf v8, v8, fa5
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    addi a1, a1, 640
; CHECK-NEXT:    bne a0, a2, .LBB16_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v32p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> poison)
  %i2 = getelementptr inbounds float, ptr %A, i64 %index
  %wide.load = load <8 x float>, ptr %i2, align 4
  %i4 = fadd <8 x float> %wide.load, %wide.masked.gather
  store <8 x float> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}