From 53fe94a0ce262c6e38117429a30814f54ea55b0f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 19:13:03 +0100 Subject: [CostModel][X86] Add costkinds test coverage for masked load/store/gather/scatter Noticed while starting triage for #87640 --- .../CostModel/X86/masked-intrinsic-codesize.ll | 2413 ++++++++++++++++++++ .../X86/masked-intrinsic-cost-inseltpoison.ll | 16 +- .../CostModel/X86/masked-intrinsic-cost.ll | 16 +- .../CostModel/X86/masked-intrinsic-latency.ll | 2413 ++++++++++++++++++++ .../CostModel/X86/masked-intrinsic-sizelatency.ll | 2413 ++++++++++++++++++++ 5 files changed, 7255 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll new file mode 100644 index 0000000..55fdaaf --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll @@ -0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output 
-cost-kind=code-size -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x 
float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 
x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 
x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for 
instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 
x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr 
undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> 
undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for 
instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x 
double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) 
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, 
i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x 
i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 
x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> 
@llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; 
KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> 
@llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> 
@llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> 
undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, 
<4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x 
i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x 
i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + 
%V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void 
@llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void 
@llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, 
ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void 
@llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void 
@llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x 
i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, 
<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void 
@llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr 
undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void 
@llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> 
undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 
0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) 
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x 
i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void 
@llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x 
i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void 
@llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x 
ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for 
instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x 
i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x 
double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> 
@llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x 
i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> 
undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> 
@llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> 
undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x 
i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x 
ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> 
@llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void 
@llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x 
i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void 
@llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> 
undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void 
@llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> 
undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> 
@llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> 
undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for 
instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: 
Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x 
i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x 
float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + 
%V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> 
undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, 
<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; 
SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void 
@llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void 
@llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = 
icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + 
%mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + 
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> 
%trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 
for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, 
<16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 
x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> 
%broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) + ret void 
+} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; 
SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x 
i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr
float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret
<4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x
i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 
x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x 
i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, 
ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare 
<1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, 
i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x 
i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void 
@llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll index 897344d..ad56c28 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2 -; -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
--check-prefixes=AVX,SKL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX define i32 @masked_load() { ; SSE2-LABEL: 'masked_load' diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 5f22b2e..c7e7c46 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2 -; -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX define i32 @masked_load() { ; SSE2-LABEL: 'masked_load' diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll new file mode 100644 index 0000000..edb05ad --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll @@
-0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +;
SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x 
float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: 
%V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, 
i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 
for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> 
@llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x 
i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = 
call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x 
float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, 
i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> 
@llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> 
@llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> 
undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> 
undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, 
<5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> 
undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> 
@llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void 
@llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void 
@llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void 
@llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> 
undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call 
void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void 
@llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void 
@llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x 
i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> 
undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x 
double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, 
<11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 
x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x 
float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call 
void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x 
i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void 
@llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, 
i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, 
<8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; 
SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x 
i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> 
undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> 
@llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> 
@llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, 
<2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> 
@llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> 
@llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 
for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x 
double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 
x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, 
<2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 
for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void 
@llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost 
of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 
1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, 
<2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, 
i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 
0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void 
@llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> 
@llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 
7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x 
i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> 
@llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> 
@llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, 
<32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> 
undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 
for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void 
@llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void 
@llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void 
@llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 
x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call 
void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void 
@llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp 
eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 
'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> 
%trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; 
AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask 
= icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> 
@llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; 
SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) 
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 
x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 
'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, 
i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x 
ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for 
instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL:
'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +;
KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>,
<14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, 
i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void 
@llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x 
i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) 
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void 
@llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void 
@llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, 
ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll new file mode 100644 index 0000000..3ebd9cc --- /dev/null +++ 
b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll @@ -0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 =
call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: 
%V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> 
@llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x 
i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 
1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> 
@llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> 
@llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, 
<13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> 
@llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> 
@llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, 
i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> 
@llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> 
@llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) 
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = 
call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, 
<14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = 
call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call 
<4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, 
<10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x 
double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> 
undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 
x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: 
call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, 
<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 
1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for 
instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, 
<16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call 
void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr 
undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call 
void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr 
undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void 
@llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x 
i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void 
@llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr 
undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> 
@llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 
x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x 
ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for 
instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x 
i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x 
double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> 
@llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x 
i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> 
undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> 
undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x 
i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> 
@llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x 
i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void 
@llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x 
i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; 
AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, 
<4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> 
undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void 
@llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 
x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> 
@llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, 
<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> 
@llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr 
undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> 
undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void 
@llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> 
undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void 
@llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated 
cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void 
@llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> 
%res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, 
zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x 
float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> 
%trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> 
@llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> 
%dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = 
call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL:
'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for
instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x
i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost
of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x 
float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; 
SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr 
%base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' 
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x 
i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, 
<4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> 
@llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x 
i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void 
@llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void 
@llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) 
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> 
@llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void 
@llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> 
@llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x 
i1>) -- cgit v1.1