/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef LIBITM_X86_UNALIGNED_H
#define LIBITM_X86_UNALIGNED_H 1

#define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
#define HAVE_ARCH_UNALIGNED_LOAD2_U8 1

#include "config/generic/unaligned.h"

namespace GTM HIDDEN {
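
/* unaligned_load2<T> returns a T whose bytes straddle the boundary
   between two consecutive cachelines C1 and C2; OFS is the byte
   offset of the value within C1.  The generic implementation in
   config/generic/unaligned.h assembles the result from both lines;
   the specializations below do it with x86 double-shift and vector
   instructions instead.  */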

template<>
inline uint32_t
unaligned_load2<uint32_t>(const gtm_cacheline *c1,
			  const gtm_cacheline *c2, size_t ofs)
{
  uint32_t r, lo, hi;
  lo = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  hi = c2->u32[0];
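  /* SHRD concatenates HI:LO and shifts the pair right by CL bits,
     leaving in R exactly the four bytes that straddle the boundary.  */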
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  return r;
}

template<>
inline uint64_t
unaligned_load2<uint64_t>(const gtm_cacheline *c1,
			  const gtm_cacheline *c2, size_t ofs)
{
#ifdef __x86_64__
  uint64_t r, lo, hi;
  lo = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
  hi = c2->u64[0];
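  /* Same SHRD trick as the 32-bit case, on 64-bit registers.  */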
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  return r;
#else
  uint32_t v0, v1, v2;
  uint64_t r;

  /* Pick the three 32-bit words that cover the value: two from C1 and
     one from C2 when the value begins in C1's second-to-last word,
     otherwise one from C1 and two from C2.  */
  if (ofs < CACHELINE_SIZE - 4)
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 2];
      v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v2 = c2->u32[0];
    }
  else
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v1 = c2->u32[0];
      v2 = c2->u32[1];
    }
  ofs = (ofs & 3) * 8;
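  /* Two cascaded SHRDs build the result in EDX:EAX (constraint "A"):
     V0 takes its high bits from V1, and V1 in turn from V2.  */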
  asm("shrd %%cl, %[v1], %[v0]; shrd %%cl, %[v2], %[v1]"
      : "=A"(r) : "c"(ofs), [v0] "a"(v0), [v1] "d"(v1), [v2] "r"(v2));

  return r;
#endif
}

#if defined(__SSE2__) || defined(__MMX__)
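// Combine the two 8-byte halves with vector shifts; unlike scalar
// shifts, the MMX/SSE2 shift intrinsics simply yield zero for counts
// of 64 or more.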
template<>
inline _ITM_TYPE_M64
unaligned_load2<_ITM_TYPE_M64>(const gtm_cacheline *c1,
			       const gtm_cacheline *c2, size_t ofs)
{
# ifdef __x86_64__
  __m128i lo = _mm_movpi64_epi64 (c1->m64[CACHELINE_SIZE / 8 - 1]);
  __m128i hi = _mm_movpi64_epi64 (c2->m64[0]);

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_epi64 (lo, ofs);
  hi = _mm_slli_epi64 (hi, 64 - ofs);
  lo = lo | hi;
  return _mm_movepi64_pi64 (lo);
# else
  // On 32-bit we're about to return the result in an MMX register, so go
  // ahead and do the computation in that unit, even if SSE2 is available.
  __m64 lo = c1->m64[CACHELINE_SIZE / 8 - 1];
  __m64 hi = c2->m64[0];

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_si64 (lo, ofs);
  hi = _mm_slli_si64 (hi, 64 - ofs);
  return lo | hi;
# endif
}
#endif // SSE2 or MMX

// The SSE types are strictly aligned: a plain load or store of
// _ITM_TYPE_M128 uses MOVAPS, which faults on a misaligned address.
#ifdef __SSE__
template<>
  struct strict_alignment<_ITM_TYPE_M128>
    : public std::true_type
  { };

// Expand the unaligned SSE move instructions.
template<>
inline _ITM_TYPE_M128
unaligned_load<_ITM_TYPE_M128>(const void *t)
{
  return _mm_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M128>(void *t, _ITM_TYPE_M128 val)
{
  _mm_storeu_ps (static_cast<float *>(t), val);
}
#endif // SSE

#ifdef __AVX__
// The AVX types are likewise strictly aligned: VMOVAPS faults on a
// misaligned address where VMOVUPS does not.
template<>
  struct strict_alignment<_ITM_TYPE_M256>
    : public std::true_type
  { };

template<>
inline _ITM_TYPE_M256
unaligned_load<_ITM_TYPE_M256>(const void *t)
{
  return _mm256_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M256>(void *t, _ITM_TYPE_M256 val)
{
  _mm256_storeu_ps (static_cast<float *>(t), val);
}
#endif // AVX

/* realign_m128i combines the tail of one 16-byte vector with the head
   of the next, shifted right by a runtime byte count.  PALIGNR and
   VPALIGNR accept only an immediate shift, so outside of XOP's
   register-driven VPPERM the byte count selects an entry in a table
   of executable stubs, each presumably holding the instruction with
   one fixed immediate plus a return; the inline asm calls straight
   into the chosen entry.  */
#ifdef __XOP__
# define HAVE_ARCH_REALIGN_M128I 1
extern const __v16qi GTM_vpperm_shift[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  return _mm_perm_epi8 (lo, hi, (__m128i) GTM_vpperm_shift[byte_count]);
}
#elif defined(__AVX__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_vpalignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
	"r"(&GTM_vpalignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSSE3__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_palignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
	"r"(&GTM_palignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSE2__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const char GTM_pshift_table[16 * 16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = lo;
  register __m128i xmm1 __asm__("xmm1") = hi;
  __asm("call *%2" : "+x"(xmm0), "+x"(xmm1)
	: "r"(GTM_pshift_table + byte_count*16));
  return xmm0;
}
#endif // XOP, AVX, SSSE3, SSE2

#ifdef HAVE_ARCH_REALIGN_M128I
template<>
inline _ITM_TYPE_M128
unaligned_load2<_ITM_TYPE_M128>(const gtm_cacheline *c1,
				const gtm_cacheline *c2, size_t ofs)
{
  return (_ITM_TYPE_M128)
    realign_m128i (c1->m128i[CACHELINE_SIZE / 16 - 1],
		   c2->m128i[0], ofs & 15);
}
#endif // HAVE_ARCH_REALIGN_M128I

#ifdef __AVX__
template<>
inline _ITM_TYPE_M256
unaligned_load2<_ITM_TYPE_M256>(const gtm_cacheline *c1,
				const gtm_cacheline *c2, size_t ofs)
{
  __m128i v0, v1;
  __m256i r;

  /* The 16-byte half that straddles the line boundary comes from the
     M128 loader above; the other half lies wholly within C1 or C2 and
     is loaded directly.  */
  v0 = (__m128i) unaligned_load2<_ITM_TYPE_M128>(c1, c2, ofs);
  if (ofs < CACHELINE_SIZE - 16)
    v1 = v0, v0 = _mm_loadu_si128 ((const __m128i *) &c1->b[ofs]);
  else
    v1 = _mm_loadu_si128 ((const __m128i *) &c2->b[ofs + 16 - CACHELINE_SIZE]);

  r = _mm256_castsi128_si256 ((__m128i)v0);
  r = _mm256_insertf128_si256 (r, (__m128i)v1, 1);
  return (_ITM_TYPE_M256) r;
}
#endif // AVX
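
/* Illustrative sketch only (names are hypothetical, not part of the
   library): a caller holding two consecutive cacheline pointers LINE1
   and LINE2 fetches a straddling 64-bit value as

     uint64_t v = unaligned_load2<uint64_t> (line1, line2, ofs);

   where OFS is measured from the start of LINE1.  */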

} // namespace GTM

#endif // LIBITM_X86_UNALIGNED_H