; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve2 \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,SVE
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=off -mattr=+neon,+dotprod \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,NEON

; COMMON: LV: Checking a loop in 'sub_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))

; COMMON: LV: Checking a loop in 'add_sub_chained_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; SVE: Cost of 9 for VF vscale x 16: EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 9 for VF 16: EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))

target triple = "aarch64"

; Test the cost of a SUB reduction, where the SUB is implemented outside the loop
; and therefore not part of the partial reduction.
; i32 accumulator, i8 inputs sign-extended to i32, mul feeding a SUB reduction.
; Value names (%acc, %load1, %load2) are matched by the FileCheck patterns
; above, so they must not be renamed.
define i32 @sub_reduction(ptr %arr1, ptr %arr2, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul = mul i32 %sext1, %sext2
  ; acc -= sext(arr1[iv]) * sext(arr2[iv])
  %sub = sub i32 %acc, %mul
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

; Test that the cost of a SUB that is part of an ADD-SUB reduction chain
; is high, because the negation happens inside the loop and cannot be
; folded into the SDOT instruction (because of the extend).
define i32 @add_sub_chained_reduction(ptr %arr1, ptr %arr2, ptr %arr3, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul1 = mul i32 %sext1, %sext2
  ; acc += sext(arr1[iv]) * sext(arr2[iv])
  %add = add i32 %acc, %mul1
  %gep3 = getelementptr inbounds i8, ptr %arr3, i32 %iv
  %load3 = load i8, ptr %gep3
  %sext3 = sext i8 %load3 to i32
  %mul2 = mul i32 %sext2, %sext3
  ; acc -= sext(arr2[iv]) * sext(arr3[iv]) -- the in-loop negation that the
  ; SVE/NEON checks above cost at 9 instead of 1.
  %sub = sub i32 %add, %mul2
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

attributes #0 = { vscale_range(1,16) }

; Force IC=1 and VF=16 so the costs printed in the checks are stable.
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.interleave.count", i32 1}
!2 = !{!"llvm.loop.vectorize.width", i32 16}