; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s

; Purpose: run the SLP vectorizer over hand-unrolled multiply-accumulate loops
; whose operands are adjacent i8 struct fields converted to float, lower the
; result for wasm32 with simd128, and inspect the generated assembly.
;
; Each function below has the same shape:
;   * %x and %y are indexed as arrays of a byte struct by the induction variable;
;   * every i8 field is loaded, converted with sitofp, and the x/y products are
;     accumulated into one float accumulator per field;
;   * the accumulators start from the floats already stored in the sret result
;     and are written back when the loop exits (or left untouched when n == 0).
;
; Two axes are covered: 2 lanes vs. 4 lanes, and llvm.fmuladd vs. separate
; fmul + fadd.
;
; Layout note: every lit/FileCheck directive and every IR instruction must sit
; on its own physical line, because ';' comments out the remainder of a line.

%struct.TwoBytes = type { i8, i8 }
%struct.FourBytes = type { i8, i8, i8, i8 }
%struct.TwoFloats = type { float, float }
%struct.FourFloats = type { float, float, float, float }

; Two-lane variant using llvm.fmuladd.
; The only expectation is negative: nothing matching "v128.load" may appear in
; this function's output.
; NOTE(review): presumably a 2 x i8 group is considered too narrow to profit
; from a vector load -- confirm against the cost-model change this test guards.
; CHECK-LABEL: mac_2d_f32_i8_fmuladd:
; CHECK-NOT: v128.load
define hidden void @mac_2d_f32_i8_fmuladd(ptr dead_on_unwind noalias writable sret(%struct.TwoFloats) align 4 captures(none) %agg.result, ptr noundef readonly captures(none) %x, ptr noundef readonly captures(none) %y, i32 noundef %n) {
entry:
  ; Field 0 of the result is loaded up front; it is stored back unchanged on
  ; the n == 0 path.
  %agg.result.promoted = load float, ptr %agg.result, align 4
  %cmp18.not = icmp eq i32 %n, 0
  br i1 %cmp18.not, label %for.cond.cleanup, label %for.body.lr.ph

for.body.lr.ph:
  ; Field 1 of the result lives at byte offset 4.
  %b10 = getelementptr inbounds nuw i8, ptr %agg.result, i32 4
  %b10.promoted = load float, ptr %b10, align 4
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Write back the field-1 accumulator; field 0 is stored in the common exit.
  store float %7, ptr %b10, align 4
  br label %for.cond.cleanup

for.cond.cleanup:
  %.lcssa = phi float [ %4, %for.cond.for.cond.cleanup_crit_edge ], [ %agg.result.promoted, %entry ]
  store float %.lcssa, ptr %agg.result, align 4
  ret void

for.body:
  ; %0 accumulates field 1, %1 accumulates field 0.
  %0 = phi float [ %b10.promoted, %for.body.lr.ph ], [ %7, %for.body ]
  %i.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %1 = phi float [ %agg.result.promoted, %for.body.lr.ph ], [ %4, %for.body ]
  ; Lane 0: x[i] field 0 times y[i] field 0.
  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %x, i32 %i.019
  %2 = load i8, ptr %arrayidx, align 1
  %conv = sitofp i8 %2 to float
  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %y, i32 %i.019
  %3 = load i8, ptr %arrayidx1, align 1
  %conv3 = sitofp i8 %3 to float
  %4 = tail call float @llvm.fmuladd.f32(float %conv, float %conv3, float %1)
  ; Lane 1: the byte at offset 1 of each element.
  %b = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
  %5 = load i8, ptr %b, align 1
  %conv6 = sitofp i8 %5 to float
  %b8 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
  %6 = load i8, ptr %b8, align 1
  %conv9 = sitofp i8 %6 to float
  %7 = tail call float @llvm.fmuladd.f32(float %conv6, float %conv9, float %0)
  %inc = add nuw i32 %i.019, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; Two-lane variant using separate fmul and fadd instead of llvm.fmuladd.
; Same negative expectation as above: no "v128.load" in the output.
; CHECK-LABEL: mac_2d_f32_i8:
; CHECK-NOT: v128.load
define hidden void @mac_2d_f32_i8(ptr dead_on_unwind noalias writable sret(%struct.TwoFloats) align 4 captures(none) %agg.result, ptr noundef readonly captures(none) %x, ptr noundef readonly captures(none) %y, i32 noundef %n) {
entry:
  ; Field 0 of the result is loaded up front; it is stored back unchanged on
  ; the n == 0 path.
  %agg.result.promoted = load float, ptr %agg.result, align 4
  %cmp18.not = icmp eq i32 %n, 0
  br i1 %cmp18.not, label %for.cond.cleanup, label %for.body.lr.ph

for.body.lr.ph:
  ; Field 1 of the result lives at byte offset 4.
  %b10 = getelementptr inbounds nuw i8, ptr %agg.result, i32 4
  %b10.promoted = load float, ptr %b10, align 4
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Write back the field-1 accumulator; field 0 is stored in the common exit.
  store float %7, ptr %b10, align 4
  br label %for.cond.cleanup

for.cond.cleanup:
  %.lcssa = phi float [ %4, %for.cond.for.cond.cleanup_crit_edge ], [ %agg.result.promoted, %entry ]
  store float %.lcssa, ptr %agg.result, align 4
  ret void

for.body:
  ; %0 accumulates field 1, %1 accumulates field 0.
  %0 = phi float [ %b10.promoted, %for.body.lr.ph ], [ %7, %for.body ]
  %i.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %1 = phi float [ %agg.result.promoted, %for.body.lr.ph ], [ %4, %for.body ]
  ; Lane 0: x[i] field 0 times y[i] field 0.
  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %x, i32 %i.019
  %2 = load i8, ptr %arrayidx, align 1
  %conv = sitofp i8 %2 to float
  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %y, i32 %i.019
  %3 = load i8, ptr %arrayidx1, align 1
  %conv3 = sitofp i8 %3 to float
  %fmul = fmul float %conv, %conv3
  %4 = fadd float %fmul, %1
  ; Lane 1: the byte at offset 1 of each element.
  %b = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
  %5 = load i8, ptr %b, align 1
  %conv6 = sitofp i8 %5 to float
  %b8 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
  %6 = load i8, ptr %b8, align 1
  %conv9 = sitofp i8 %6 to float
  %fmul.1 = fmul float %conv6, %conv9
  %7 = fadd float %fmul.1, %0
  %inc = add nuw i32 %i.019, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

declare float @llvm.fmuladd.f32(float, float, float)

; Four-lane variant using llvm.fmuladd.
; Expected inside the loop, once per operand: a single v128.load32_zero that
; fetches the four i8 fields, two signed widening steps (i8 -> i16 -> i32) and
; a conversion to f32x4. The two converted vectors are then combined with a
; separate f32x4.mul and f32x4.add.
; CHECK-LABEL: mac_4d_f32_i8_fmuladd:
; CHECK: loop
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
; CHECK: f32x4.add
define hidden void @mac_4d_f32_i8_fmuladd(ptr dead_on_unwind noalias writable sret(%struct.FourFloats) align 4 captures(none) %agg.result, ptr noundef readonly captures(none) %x, ptr noundef readonly captures(none) %y, i32 noundef %n) {
entry:
  ; Field 0 of the result is loaded up front; it is stored back unchanged on
  ; the n == 0 path.
  %agg.result.promoted = load float, ptr %agg.result, align 4
  %cmp38.not = icmp eq i32 %n, 0
  br i1 %cmp38.not, label %for.cond.cleanup, label %for.body.lr.ph

for.body.lr.ph:
  ; Fields 1..3 of the result live at byte offsets 4, 8 and 12.
  %b10 = getelementptr inbounds nuw i8, ptr %agg.result, i32 4
  %c16 = getelementptr inbounds nuw i8, ptr %agg.result, i32 8
  %d22 = getelementptr inbounds nuw i8, ptr %agg.result, i32 12
  %b10.promoted = load float, ptr %b10, align 4
  %c16.promoted = load float, ptr %c16, align 4
  %d22.promoted = load float, ptr %d22, align 4
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Write back accumulators for fields 1..3; field 0 is stored in the common
  ; exit.
  store float %9, ptr %b10, align 4
  store float %12, ptr %c16, align 4
  store float %15, ptr %d22, align 4
  br label %for.cond.cleanup

for.cond.cleanup:
  %.lcssa = phi float [ %6, %for.cond.for.cond.cleanup_crit_edge ], [ %agg.result.promoted, %entry ]
  store float %.lcssa, ptr %agg.result, align 4
  ret void

for.body:
  ; %0, %1, %2 and %3 accumulate fields 3, 2, 1 and 0 respectively.
  %0 = phi float [ %d22.promoted, %for.body.lr.ph ], [ %15, %for.body ]
  %1 = phi float [ %c16.promoted, %for.body.lr.ph ], [ %12, %for.body ]
  %2 = phi float [ %b10.promoted, %for.body.lr.ph ], [ %9, %for.body ]
  %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %3 = phi float [ %agg.result.promoted, %for.body.lr.ph ], [ %6, %for.body ]
  ; Lane 0: byte offset 0 of x[i] and y[i].
  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %x, i32 %i.039
  %4 = load i8, ptr %arrayidx, align 1
  %conv = sitofp i8 %4 to float
  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %y, i32 %i.039
  %5 = load i8, ptr %arrayidx1, align 1
  %conv3 = sitofp i8 %5 to float
  %6 = tail call float @llvm.fmuladd.f32(float %conv, float %conv3, float %3)
  ; Lane 1: byte offset 1.
  %b = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
  %7 = load i8, ptr %b, align 1
  %conv6 = sitofp i8 %7 to float
  %b8 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
  %8 = load i8, ptr %b8, align 1
  %conv9 = sitofp i8 %8 to float
  %9 = tail call float @llvm.fmuladd.f32(float %conv6, float %conv9, float %2)
  ; Lane 2: byte offset 2.
  %c = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
  %10 = load i8, ptr %c, align 1
  %conv12 = sitofp i8 %10 to float
  %c14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
  %11 = load i8, ptr %c14, align 1
  %conv15 = sitofp i8 %11 to float
  %12 = tail call float @llvm.fmuladd.f32(float %conv12, float %conv15, float %1)
  ; Lane 3: byte offset 3.
  %d = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
  %13 = load i8, ptr %d, align 1
  %conv18 = sitofp i8 %13 to float
  %d20 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
  %14 = load i8, ptr %d20, align 1
  %conv21 = sitofp i8 %14 to float
  %15 = tail call float @llvm.fmuladd.f32(float %conv18, float %conv21, float %0)
  %inc = add nuw i32 %i.039, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; Four-lane variant using separate fmul and fadd instead of llvm.fmuladd.
; The expected instruction sequence is the same as for the fmuladd variant:
; per operand one v128.load32_zero, two signed widening steps and a conversion
; to f32x4, followed by f32x4.mul and f32x4.add.
; CHECK-LABEL: mac_4d_f32_i8:
; CHECK: loop
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
; CHECK: f32x4.add
define hidden void @mac_4d_f32_i8(ptr dead_on_unwind noalias writable sret(%struct.FourFloats) align 4 captures(none) %agg.result, ptr noundef readonly captures(none) %x, ptr noundef readonly captures(none) %y, i32 noundef %n) {
entry:
  ; Field 0 of the result is loaded up front; it is stored back unchanged on
  ; the n == 0 path.
  %agg.result.promoted = load float, ptr %agg.result, align 4
  %cmp38.not = icmp eq i32 %n, 0
  br i1 %cmp38.not, label %for.cond.cleanup, label %for.body.lr.ph

for.body.lr.ph:
  ; Fields 1..3 of the result live at byte offsets 4, 8 and 12.
  %b10 = getelementptr inbounds nuw i8, ptr %agg.result, i32 4
  %c16 = getelementptr inbounds nuw i8, ptr %agg.result, i32 8
  %d22 = getelementptr inbounds nuw i8, ptr %agg.result, i32 12
  %b10.promoted = load float, ptr %b10, align 4
  %c16.promoted = load float, ptr %c16, align 4
  %d22.promoted = load float, ptr %d22, align 4
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Write back accumulators for fields 1..3; field 0 is stored in the common
  ; exit.
  store float %9, ptr %b10, align 4
  store float %12, ptr %c16, align 4
  store float %15, ptr %d22, align 4
  br label %for.cond.cleanup

for.cond.cleanup:
  %.lcssa = phi float [ %6, %for.cond.for.cond.cleanup_crit_edge ], [ %agg.result.promoted, %entry ]
  store float %.lcssa, ptr %agg.result, align 4
  ret void

for.body:
  ; %0, %1, %2 and %3 accumulate fields 3, 2, 1 and 0 respectively.
  %0 = phi float [ %d22.promoted, %for.body.lr.ph ], [ %15, %for.body ]
  %1 = phi float [ %c16.promoted, %for.body.lr.ph ], [ %12, %for.body ]
  %2 = phi float [ %b10.promoted, %for.body.lr.ph ], [ %9, %for.body ]
  %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %3 = phi float [ %agg.result.promoted, %for.body.lr.ph ], [ %6, %for.body ]
  ; Lane 0: byte offset 0 of x[i] and y[i].
  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %x, i32 %i.039
  %4 = load i8, ptr %arrayidx, align 1
  %conv = sitofp i8 %4 to float
  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %y, i32 %i.039
  %5 = load i8, ptr %arrayidx1, align 1
  %conv3 = sitofp i8 %5 to float
  %fmul = fmul float %conv, %conv3
  %6 = fadd float %fmul, %3
  ; Lane 1: byte offset 1.
  %b = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
  %7 = load i8, ptr %b, align 1
  %conv6 = sitofp i8 %7 to float
  %b8 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
  %8 = load i8, ptr %b8, align 1
  %conv9 = sitofp i8 %8 to float
  %fmul.1 = fmul float %conv6, %conv9
  %9 = fadd float %fmul.1, %2
  ; Lane 2: byte offset 2.
  %c = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
  %10 = load i8, ptr %c, align 1
  %conv12 = sitofp i8 %10 to float
  %c14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
  %11 = load i8, ptr %c14, align 1
  %conv15 = sitofp i8 %11 to float
  %fmul.2 = fmul float %conv12, %conv15
  %12 = fadd float %fmul.2, %1
  ; Lane 3: byte offset 3.
  %d = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
  %13 = load i8, ptr %d, align 1
  %conv18 = sitofp i8 %13 to float
  %d20 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
  %14 = load i8, ptr %d20, align 1
  %conv21 = sitofp i8 %14 to float
  %fmul.3 = fmul float %conv18, %conv21
  %15 = fadd float %fmul.3, %0
  %inc = add nuw i32 %i.039, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}