; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
; RUN:   -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15

define void @fadd_reductions() {
; Z15-LABEL: 'fadd_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
  %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
  %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
  %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
  %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}

define void @fast_fadd_reductions(ptr %src, ptr %dst) {
; Z15-LABEL: 'fast_fadd_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
  %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
  %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
  %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
  %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}

define void @fmul_reductions() {
; Z15-LABEL: 'fmul_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
  %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
  %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
  %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
  %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}

define void @fast_fmul_reductions() {
; Z15-LABEL: 'fast_fmul_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
  %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
  %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
  %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
  %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)

  ret void
}

define void @fmin_reductions() {
; Z15-LABEL: 'fmin_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
  %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
  %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
  %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
  %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
  ret void
}

define void @fmax_reductions() {
; Z15-LABEL: 'fmax_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
  ret void
}

define void @reduceumin() {
; Z15-LABEL: 'reduceumin'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)
;
  %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)

  ret void
}

define void @reduceumax() {
; Z15-LABEL: 'reduceumax'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)
;
  %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)

  ret void
}

define void @reducesmin() {
; Z15-LABEL: 'reducesmin'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)
;
  %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)

  ret void
}

define void @reducesmax() {
; Z15-LABEL: 'reducesmax'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
; Z15-NEXT  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)
;
  %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)

  ret void
}

define void @reduceadd() {
; Z15-LABEL: 'reduceadd'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
;
; Z15-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)

  ; REDUCEADD64
  %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
  %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
  %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
  ; REDUCEADD32
  %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
  %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
  %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
  ; REDUCEADD16
  %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
  %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
  %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
  %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
  ; REDUCEADD8
  %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
  %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
  %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
  %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
  ; EXTREME VALUES
  %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
  %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)

  ret void
}

define void @reducemul() {
; CHECK-LABEL: 'reducemul'
; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
; CHECK:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; CHECK:  Cost Model: Found an estimated cost of 8 for instruction: %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
;
; CHECK:  Cost Model: Found an estimated cost of 15 for instruction: %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; CHECK:  Cost Model: Found an estimated cost of 28 for instruction: %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)

  ; REDUCEADD64
  %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
  %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
  %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
  ; REDUCEADD32
  %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
  %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
  %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
  ; REDUCEADD16
  %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
  %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
  %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
  %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
  ; REDUCEADD8
  %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
  %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
  %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
  %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
  ; EXTREME VALUES
  %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
  %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)

  ret void
}

declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)

declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)

declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)

declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)

declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.umin.v4i128(<4 x i128>)

declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.umax.v4i128(<4 x i128>)

declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.smin.v4i128(<4 x i128>)

declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.smax.v4i128(<4 x i128>)

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>)

declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)

declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
declare i256 @llvm.vector.reduce.mul.v4i256(<4 x i256>)