; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple aarch64 -mattr=+sve2 -o - %s | FileCheck %s

; Codegen test for a two-level loop nest built from SVE fmla.u intrinsics.
;
; Outer loop (%for.body.us): runs %kw times. Each iteration loads four
; consecutive floats from %kernel, truncates them to half and splats each into
; a <vscale x 8 x half> coefficient vector (%6, %9, %12, %15). It also derives
; four source pointers into %call5.i.i.i119 at half-element offsets
; 4*iv + {0, 4, 8, 12}, where iv is the truncated induction variable.
;
; Inner loop (%for.cond.i.preheader.us): handles four vectors per iteration, at
; byte offsets 0, 16*vscale, 32*vscale and 48*vscale from the current pointers.
; For each one, the destination vector is accumulated with the four source
; vectors multiplied by the four coefficients and then stored back. All five
; pointers then advance by 32*vscale half elements, and the loop continues
; while the destination pointer is below %add.ptr.i.
;
; All vector operations use the predicate %0 = ptrue(i32 31); the element types
; follow from the intrinsic suffixes (nxv8i1 / nxv8f16).

define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float noundef nofpclass(nan inf) %kernel_factor, ptr %call5.i.i.i119) vscale_range(1, 16) {
; CHECK-LABEL: test:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w1, #1
; CHECK-NEXT:    b.lt .LBB0_6
; CHECK-NEXT:  // %bb.1: // %for.body.lr.ph
; CHECK-NEXT:    rdvl x8, #-2
; CHECK-NEXT:    mov w9, #608 // =0x260
; CHECK-NEXT:    ands x11, x8, x9
; CHECK-NEXT:    b.eq .LBB0_6
; CHECK-NEXT:  // %bb.2: // %for.body.us.preheader
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    add x11, x2, x11, lsl #1
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    mov x12, #4 // =0x4
; CHECK-NEXT:    mov x13, #8 // =0x8
; CHECK-NEXT:  .LBB0_3: // %for.body.us
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB0_4 Depth 2
; CHECK-NEXT:    add x14, x0, x9, lsl #2
; CHECK-NEXT:    sbfiz x15, x8, #1, #32
; CHECK-NEXT:    mov x16, x2
; CHECK-NEXT:    ldp s0, s1, [x14]
; CHECK-NEXT:    add x15, x15, #8
; CHECK-NEXT:    ldp s2, s3, [x14, #8]
; CHECK-NEXT:    ubfiz x14, x8, #1, #32
; CHECK-NEXT:    fcvt h0, s0
; CHECK-NEXT:    fcvt h1, s1
; CHECK-NEXT:    fcvt h2, s2
; CHECK-NEXT:    fcvt h3, s3
; CHECK-NEXT:    mov z0.h, h0
; CHECK-NEXT:    mov z1.h, h1
; CHECK-NEXT:    mov z2.h, h2
; CHECK-NEXT:    mov z3.h, h3
; CHECK-NEXT:  .LBB0_4: // %for.cond.i.preheader.us
; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x16, x14]
; CHECK-NEXT:    ldr z5, [x16]
; CHECK-NEXT:    add x17, x16, x15
; CHECK-NEXT:    add x18, x16, x14
; CHECK-NEXT:    add x3, x17, #8
; CHECK-NEXT:    add x4, x17, #16
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x16, x15]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, x12, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, x13, lsl #1]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ldr z5, [x16, #1, mul vl]
; CHECK-NEXT:    str z4, [x16]
; CHECK-NEXT:    ldr z4, [x18, #1, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ldr z5, [x17, #1, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ldr z5, [x3, #1, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ldr z5, [x4, #1, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ldr z5, [x16, #2, mul vl]
; CHECK-NEXT:    str z4, [x16, #1, mul vl]
; CHECK-NEXT:    ldr z4, [x18, #2, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ldr z5, [x17, #2, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ldr z5, [x3, #2, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ldr z5, [x4, #2, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    ldr z5, [x16, #3, mul vl]
; CHECK-NEXT:    str z4, [x16, #2, mul vl]
; CHECK-NEXT:    ldr z4, [x18, #3, mul vl]
; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
; CHECK-NEXT:    ldr z5, [x17, #3, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
; CHECK-NEXT:    ldr z5, [x3, #3, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    ldr z5, [x4, #3, mul vl]
; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT:    str z4, [x16, #3, mul vl]
; CHECK-NEXT:    incb x16, all, mul #4
; CHECK-NEXT:    cmp x16, x11
; CHECK-NEXT:    b.lo .LBB0_4
; CHECK-NEXT:  // %bb.5: // %while.cond.i..exit_crit_edge.us
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    add w10, w10, #1
; CHECK-NEXT:    add x9, x9, #4
; CHECK-NEXT:    add w8, w8, #16
; CHECK-NEXT:    cmp w10, w1
; CHECK-NEXT:    b.ne .LBB0_3
; CHECK-NEXT:  .LBB0_6: // %exit78
; CHECK-NEXT:    ret
entry:
  ;%call5.i.i.i119 = tail call noalias noundef nonnull dereferenceable(1248) ptr @_Znwm(i64 noundef 1248) #7
  %cmp139 = icmp sgt i32 %kw, 0
  ;tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(1248) %call5.i.i.i119, i8 0, i64 1248, i1 false)
  br i1 %cmp139, label %for.body.lr.ph, label %exit78

for.body.lr.ph:                                   ; preds = %entry
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
  %vscale = tail call i64 @llvm.vscale.i64()
  ; %mul5.i = 32 * vscale: the number of half elements in four vectors.
  %mul5.i = shl nuw nsw i64 %vscale, 5
  ; %sub6.i = 608 & -(32 * vscale); the inner loop ends at buffer + %sub6.i halves.
  %sub.not.i = sub nsw i64 0, %mul5.i
  %sub6.i = and i64 %sub.not.i, 608
  %add.ptr.i = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %sub6.i
  %cmp.i133.not = icmp eq i64 %sub6.i, 0
  ; %vs2 = 16 * vscale: byte offset of the second vector in each group.
  %vs2 = shl nuw nsw i64 %vscale, 4
  br i1 %cmp.i133.not, label %exit78, label %for.body.us.preheader

for.body.us.preheader:                            ; preds = %for.body.lr.ph
  ; Byte offsets of the third (32 * vscale) and fourth (48 * vscale) vectors.
  %.idx.i.us.2 = shl nuw nsw i64 %vscale, 5
  %.idx.i.us.3 = mul nuw nsw i64 %vscale, 48
  br label %for.body.us

for.body.us:                                      ; preds = %for.body.us.preheader, %while.cond.i..exit_crit_edge.us
  %indvars.iv = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next, %while.cond.i..exit_crit_edge.us ]
  %i4.0140.us = phi i32 [ 0, %for.body.us.preheader ], [ %inc.us, %while.cond.i..exit_crit_edge.us ]
  ; Four source pointers at half-element offsets 4*iv + {0, 4, 8, 12}.
  %3 = trunc nuw nsw i64 %indvars.iv to i32
  %mul6.us = shl i32 %3, 2
  %idx.ext.us = zext nneg i32 %mul6.us to i64
  %add.ptr.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext.us
  %mul11.us = or disjoint i32 %mul6.us, 4
  %idx.ext12.us = sext i32 %mul11.us to i64
  %add.ptr13.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext12.us
  %mul18.us = or disjoint i32 %mul6.us, 8
  %idx.ext19.us = sext i32 %mul18.us to i64
  %add.ptr20.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext19.us
  %mul25.us = or disjoint i32 %mul6.us, 12
  %idx.ext26.us = sext i32 %mul25.us to i64
  %add.ptr27.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext26.us
  ; Load kernel[iv .. iv+3], truncate each to half and splat it across a vector.
  %add.ptr29.us = getelementptr inbounds float, ptr %kernel, i64 %indvars.iv
  %4 = load float, ptr %add.ptr29.us, align 4
  %5 = fptrunc float %4 to half
  %.splatinsert.i.us = insertelement <vscale x 8 x half> poison, half %5, i64 0
  %6 = shufflevector <vscale x 8 x half> %.splatinsert.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx2.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 4
  %7 = load float, ptr %arrayidx2.i.us, align 4
  %8 = fptrunc float %7 to half
  %.splatinsert57.i.us = insertelement <vscale x 8 x half> poison, half %8, i64 0
  %9 = shufflevector <vscale x 8 x half> %.splatinsert57.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx3.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 8
  %10 = load float, ptr %arrayidx3.i.us, align 4
  %11 = fptrunc float %10 to half
  %.splatinsert58.i.us = insertelement <vscale x 8 x half> poison, half %11, i64 0
  %12 = shufflevector <vscale x 8 x half> %.splatinsert58.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  %arrayidx4.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 12
  %13 = load float, ptr %arrayidx4.i.us, align 4
  %14 = fptrunc float %13 to half
  %.splatinsert59.i.us = insertelement <vscale x 8 x half> poison, half %14, i64 0
  %15 = shufflevector <vscale x 8 x half> %.splatinsert59.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
  br label %for.cond.i.preheader.us

for.cond.i.preheader.us:                          ; preds = %for.body.us, %for.cond.i.preheader.us
  %vdst.0.i138.us = phi ptr [ %call5.i.i.i119, %for.body.us ], [ %add.ptr15.i.us, %for.cond.i.preheader.us ]
  %s1.0.i137.us = phi ptr [ %add.ptr.us, %for.body.us ], [ %add.ptr16.i.us, %for.cond.i.preheader.us ]
  %s2.0.i136.us = phi ptr [ %add.ptr13.us, %for.body.us ], [ %add.ptr17.i.us, %for.cond.i.preheader.us ]
  %s3.0.i135.us = phi ptr [ %add.ptr20.us, %for.body.us ], [ %add.ptr18.i.us, %for.cond.i.preheader.us ]
  %s4.0.i134.us = phi ptr [ %add.ptr27.us, %for.body.us ], [ %add.ptr19.i.us, %for.cond.i.preheader.us ]
  ; Vector 0: byte offset 0.
  %16 = load <vscale x 8 x half>, ptr %s1.0.i137.us, align 16
  %17 = load <vscale x 8 x half>, ptr %vdst.0.i138.us, align 16
  %18 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %17, <vscale x 8 x half> %16, <vscale x 8 x half> %6)
  %19 = load <vscale x 8 x half>, ptr %s2.0.i136.us, align 16
  %20 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %18, <vscale x 8 x half> %19, <vscale x 8 x half> %9)
  %21 = load <vscale x 8 x half>, ptr %s3.0.i135.us, align 16
  %22 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %20, <vscale x 8 x half> %21, <vscale x 8 x half> %12)
  %23 = load <vscale x 8 x half>, ptr %s4.0.i134.us, align 16
  %24 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %22, <vscale x 8 x half> %23, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %24, ptr %vdst.0.i138.us, align 16
  ; Vector 1: byte offset %vs2 (16 * vscale).
  %25 = getelementptr i8, ptr %s1.0.i137.us, i64 %vs2
  %26 = load <vscale x 8 x half>, ptr %25, align 16
  %27 = getelementptr i8, ptr %vdst.0.i138.us, i64 %vs2
  %28 = load <vscale x 8 x half>, ptr %27, align 16
  %29 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %28, <vscale x 8 x half> %26, <vscale x 8 x half> %6)
  %30 = getelementptr i8, ptr %s2.0.i136.us, i64 %vs2
  %31 = load <vscale x 8 x half>, ptr %30, align 16
  %32 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %29, <vscale x 8 x half> %31, <vscale x 8 x half> %9)
  %33 = getelementptr i8, ptr %s3.0.i135.us, i64 %vs2
  %34 = load <vscale x 8 x half>, ptr %33, align 16
  %35 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %32, <vscale x 8 x half> %34, <vscale x 8 x half> %12)
  %36 = getelementptr i8, ptr %s4.0.i134.us, i64 %vs2
  %37 = load <vscale x 8 x half>, ptr %36, align 16
  %38 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %35, <vscale x 8 x half> %37, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %38, ptr %27, align 16
  ; Vector 2: byte offset %.idx.i.us.2 (32 * vscale).
  %39 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.2
  %40 = load <vscale x 8 x half>, ptr %39, align 16
  %41 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.2
  %42 = load <vscale x 8 x half>, ptr %41, align 16
  %43 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %42, <vscale x 8 x half> %40, <vscale x 8 x half> %6)
  %44 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.2
  %45 = load <vscale x 8 x half>, ptr %44, align 16
  %46 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %43, <vscale x 8 x half> %45, <vscale x 8 x half> %9)
  %47 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.2
  %48 = load <vscale x 8 x half>, ptr %47, align 16
  %49 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %46, <vscale x 8 x half> %48, <vscale x 8 x half> %12)
  %50 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.2
  %51 = load <vscale x 8 x half>, ptr %50, align 16
  %52 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %49, <vscale x 8 x half> %51, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %52, ptr %41, align 16
  ; Vector 3: byte offset %.idx.i.us.3 (48 * vscale).
  %53 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.3
  %54 = load <vscale x 8 x half>, ptr %53, align 16
  %55 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.3
  %56 = load <vscale x 8 x half>, ptr %55, align 16
  %57 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %56, <vscale x 8 x half> %54, <vscale x 8 x half> %6)
  %58 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.3
  %59 = load <vscale x 8 x half>, ptr %58, align 16
  %60 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %57, <vscale x 8 x half> %59, <vscale x 8 x half> %9)
  %61 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.3
  %62 = load <vscale x 8 x half>, ptr %61, align 16
  %63 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %60, <vscale x 8 x half> %62, <vscale x 8 x half> %12)
  %64 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.3
  %65 = load <vscale x 8 x half>, ptr %64, align 16
  %66 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %63, <vscale x 8 x half> %65, <vscale x 8 x half> %15)
  store <vscale x 8 x half> %66, ptr %55, align 16
  ; Advance all five pointers by %mul5.i half elements (four vectors).
  %add.ptr15.i.us = getelementptr inbounds half, ptr %vdst.0.i138.us, i64 %mul5.i
  %add.ptr16.i.us = getelementptr inbounds half, ptr %s1.0.i137.us, i64 %mul5.i
  %add.ptr17.i.us = getelementptr inbounds half, ptr %s2.0.i136.us, i64 %mul5.i
  %add.ptr18.i.us = getelementptr inbounds half, ptr %s3.0.i135.us, i64 %mul5.i
  %add.ptr19.i.us = getelementptr inbounds half, ptr %s4.0.i134.us, i64 %mul5.i
  %cmp.i.us = icmp ult ptr %add.ptr15.i.us, %add.ptr.i
  br i1 %cmp.i.us, label %for.cond.i.preheader.us, label %while.cond.i..exit_crit_edge.us

while.cond.i..exit_crit_edge.us:                  ; preds = %for.cond.i.preheader.us
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
  %inc.us = add nuw nsw i32 %i4.0140.us, 1
  %exitcond.not = icmp eq i32 %inc.us, %kw
  br i1 %exitcond.not, label %exit78, label %for.body.us

exit78:                                           ; preds = %while.cond.i..exit_crit_edge.us, %for.body.lr.ph, %entry
  ret void
}