llvm/test/CodeGen/X86/combine-concatvectors.ll


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

define void @PR32957(ptr %in, ptr %out) {
; CHECK-LABEL: PR32957:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %ld = load <2 x float>, ptr %in, align 8
  %ext = extractelement <2 x float> %ld, i64 0
  %ext2 = extractelement <2 x float> %ld, i64 1
  %ins = insertelement <8 x float> <float undef, float undef, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, float %ext, i64 0
  %ins2 = insertelement <8 x float> %ins, float %ext2, i64 1
  store <8 x float> %ins2, ptr %out, align 32
  ret void
}

declare { i8, double } @fun()

; Check that this does not fail to combine concat_vectors of a value from
; merge_values through a bitcast.
define void @d(i1 %cmp) {
; CHECK-LABEL: d:
; CHECK:       # %bb.0: # %bar
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    callq fun@PLT
bar:
  %val = call { i8, double } @fun()
  %extr = extractvalue { i8, double } %val, 1
  %bc = bitcast double %extr to <2 x float>
  br label %baz

baz:
  %extr1 = extractelement <2 x float> %bc, i64 0
  unreachable
}

@qa_ = external unnamed_addr global [49216 x i8], align 32

define void @concat_of_broadcast_v2f64_v4f64() {
; AVX1-LABEL: concat_of_broadcast_v2f64_v4f64:
; AVX1:       # %bb.0: # %alloca_0
; AVX1-NEXT:    movq qa_@GOTPCREL(%rip), %rax
; AVX1-NEXT:    movl $1091567616, 30256(%rax) # imm = 0x41100000
; AVX1-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
; AVX1-NEXT:    movq %rcx, 46348(%rax)
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm0 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vmovups %ymm0, 48296(%rax)
; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = [7.812501848093234E-3,0.0E+0]
; AVX1-NEXT:    vmovsd %xmm0, 47372(%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_of_broadcast_v2f64_v4f64:
; AVX2:       # %bb.0: # %alloca_0
; AVX2-NEXT:    movq qa_@GOTPCREL(%rip), %rax
; AVX2-NEXT:    movl $1091567616, 30256(%rax) # imm = 0x41100000
; AVX2-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
; AVX2-NEXT:    movq %rcx, 46348(%rax)
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vmovups %ymm0, 48296(%rax)
; AVX2-NEXT:    vmovlps %xmm0, 47372(%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
alloca_0:
  store float 9.000000e+00, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 30256), align 16
  store <2 x i32> <i32 1, i32 1>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 46348), align 4
  br label %loop.4942

loop.4942:                                        ; preds = %loop.4942, %alloca_0
  br i1 poison, label %loop.4942, label %ifmerge.1298

ifmerge.1298:                                     ; preds = %loop.4942
  %gepload4638 = load float, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 28324), align 4
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 48296), align 8
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 48304), align 16
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 48312), align 8
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 48320), align 32
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 47372), align 4
  ret void
}

define <4 x float> @concat_of_broadcast_v4f32_v8f32(ptr %a0, ptr %a1, ptr %a2) {
; AVX1-LABEL: concat_of_broadcast_v4f32_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vmovaps (%rsi), %ymm1
; AVX1-NEXT:    vmovaps (%rdx), %ymm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,0],xmm0[0,0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_of_broadcast_v4f32_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0]
; AVX2-NEXT:    vmovaps {{.*#+}} xmm1 = [6,7,4,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld0 = load volatile <8 x float>, ptr %a0
  %ld1 = load volatile <8 x float>, ptr %a1
  %ld2 = load volatile <8 x float>, ptr %a2
  %shuffle = shufflevector <8 x float> %ld0, <8 x float> %ld1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 8>
  %shuffle1 = shufflevector <8 x float> %ld2, <8 x float> %shuffle, <4 x i32> <i32 6, i32 15, i32 12, i32 3>
  ret <4 x float> %shuffle1
}

define <4 x i64> @broadcast_of_shuffle_v2i64_v4i64(<16 x i8> %vecinit.i) {
; AVX1-LABEL: broadcast_of_shuffle_v2i64_v4i64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_of_shuffle_v2i64_v4i64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> poison, <16 x i32> zeroinitializer
  %0 = bitcast <16 x i8> %vecinit15.i to <2 x i64>
  %1 = extractelement <2 x i64> %0, i64 0
  %2 = and i64 %1, -72057594037927936 ; 0xFF00 0000 0000 0000
  %3 = insertelement <4 x i64> poison, i64 %2, i64 0
  %4 = shufflevector <4 x i64> %3, <4 x i64> poison, <4 x i32> zeroinitializer
  ret <4 x i64> %4
}