Diffstat (limited to 'libitm/config/x86/unaligned.h')
-rw-r--r-- | libitm/config/x86/unaligned.h | 237
1 file changed, 0 insertions, 237 deletions
diff --git a/libitm/config/x86/unaligned.h b/libitm/config/x86/unaligned.h
deleted file mode 100644
index 01abc47..0000000
--- a/libitm/config/x86/unaligned.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef LIBITM_X86_UNALIGNED_H
-#define LIBITM_X86_UNALIGNED_H 1
-
-#define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
-#define HAVE_ARCH_UNALIGNED_LOAD2_U8 1
-
-#include "config/generic/unaligned.h"
-
-namespace GTM HIDDEN {
-
-template<>
-inline uint32_t
-unaligned_load2<uint32_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint32_t r, lo, hi;
-  lo = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
-  hi = c2->u32[0];
-  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
-  return r;
-}
-
-template<>
-inline uint64_t
-unaligned_load2<uint64_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-#ifdef __x86_64__
-  uint64_t r, lo, hi;
-  lo = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
-  hi = c2->u64[0];
-  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 7) * 8), "0"(lo));
-  return r;
-#else
-  uint32_t v0, v1, v2;
-  uint64_t r;
-
-  if (ofs < CACHELINE_SIZE - 4)
-    {
-      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 2];
-      v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
-      v2 = c2->u32[0];
-    }
-  else
-    {
-      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
-      v1 = c2->u32[0];
-      v2 = c2->u32[1];
-    }
-  ofs = (ofs & 3) * 8;
-  asm("shrd %%cl, %[v1], %[v0]; shrd %%cl, %[v2], %[v1]"
-      : "=A"(r) : "c"(ofs), [v0] "a"(v0), [v1] "d"(v1), [v2] "r"(v2));
-
-  return r;
-#endif
-}
-
-#if defined(__SSE2__) || defined(__MMX__)
-template<>
-inline _ITM_TYPE_M64
-unaligned_load2<_ITM_TYPE_M64>(const gtm_cacheline *c1,
-                               const gtm_cacheline *c2, size_t ofs)
-{
-# ifdef __x86_64__
-  __m128i lo = _mm_movpi64_epi64 (c1->m64[CACHELINE_SIZE / 8 - 1]);
-  __m128i hi = _mm_movpi64_epi64 (c2->m64[0]);
-
-  ofs = (ofs & 7) * 8;
-  lo = _mm_srli_epi64 (lo, ofs);
-  hi = _mm_slli_epi64 (hi, 64 - ofs);
-  lo = lo | hi;
-  return _mm_movepi64_pi64 (lo);
-# else
-  // On 32-bit we're about to return the result in an MMX register, so go
-  // ahead and do the computation in that unit, even if SSE2 is available.
-  __m64 lo = c1->m64[CACHELINE_SIZE / 8 - 1];
-  __m64 hi = c2->m64[0];
-
-  ofs = (ofs & 7) * 8;
-  lo = _mm_srli_si64 (lo, ofs);
-  hi = _mm_slli_si64 (hi, 64 - ofs);
-  return lo | hi;
-# endif
-}
-#endif // SSE2 or MMX
-
-// The SSE types are strictly aligned.
-#ifdef __SSE__
-template<>
-  struct strict_alignment<_ITM_TYPE_M128>
-    : public std::true_type
-  { };
-
-// Expand the unaligned SSE move instructions.
-template<>
-inline _ITM_TYPE_M128
-unaligned_load<_ITM_TYPE_M128>(const void *t)
-{
-  return _mm_loadu_ps (static_cast<const float *>(t));
-}
-
-template<>
-inline void
-unaligned_store<_ITM_TYPE_M128>(void *t, _ITM_TYPE_M128 val)
-{
-  _mm_storeu_ps (static_cast<float *>(t), val);
-}
-#endif // SSE
-
-#ifdef __AVX__
-// The AVX types are strictly aligned when it comes to vmovaps vs vmovups.
-template<>
-  struct strict_alignment<_ITM_TYPE_M256>
-    : public std::true_type
-  { };
-
-template<>
-inline _ITM_TYPE_M256
-unaligned_load<_ITM_TYPE_M256>(const void *t)
-{
-  return _mm256_loadu_ps (static_cast<const float *>(t));
-}
-
-template<>
-inline void
-unaligned_store<_ITM_TYPE_M256>(void *t, _ITM_TYPE_M256 val)
-{
-  _mm256_storeu_ps (static_cast<float *>(t), val);
-}
-#endif // AVX
-
-#ifdef __XOP__
-# define HAVE_ARCH_REALIGN_M128I 1
-extern const __v16qi GTM_vpperm_shift[16];
-inline __m128i
-realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
-{
-  return _mm_perm_epi8 (lo, hi, GTM_vpperm_shift[byte_count]);
-}
-#elif defined(__AVX__)
-# define HAVE_ARCH_REALIGN_M128I 1
-extern "C" const uint64_t GTM_vpalignr_table[16];
-inline __m128i
-realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
-{
-  register __m128i xmm0 __asm__("xmm0") = hi;
-  register __m128i xmm1 __asm__("xmm1") = lo;
-  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
-        "r"(&GTM_vpalignr_table[byte_count]));
-  return xmm0;
-}
-#elif defined(__SSSE3__)
-# define HAVE_ARCH_REALIGN_M128I 1
-extern "C" const uint64_t GTM_palignr_table[16];
-inline __m128i
-realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
-{
-  register __m128i xmm0 __asm__("xmm0") = hi;
-  register __m128i xmm1 __asm__("xmm1") = lo;
-  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
-        "r"(&GTM_palignr_table[byte_count]));
-  return xmm0;
-}
-#elif defined(__SSE2__)
-# define HAVE_ARCH_REALIGN_M128I 1
-extern "C" const char GTM_pshift_table[16 * 16];
-inline __m128i
-realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
-{
-  register __m128i xmm0 __asm__("xmm0") = lo;
-  register __m128i xmm1 __asm__("xmm1") = hi;
-  __asm("call *%2" : "+x"(xmm0), "+x"(xmm1)
-        : "r"(GTM_pshift_table + byte_count*16));
-  return xmm0;
-}
-#endif // XOP, AVX, SSSE3, SSE2
-
-#ifdef HAVE_ARCH_REALIGN_M128I
-template<>
-inline _ITM_TYPE_M128
-unaligned_load2<_ITM_TYPE_M128>(const gtm_cacheline *c1,
-                                const gtm_cacheline *c2, size_t ofs)
-{
-  return (_ITM_TYPE_M128)
-    realign_m128i (c1->m128i[CACHELINE_SIZE / 16 - 1],
-                   c2->m128i[0], ofs & 15);
-}
-#endif // HAVE_ARCH_REALIGN_M128I
-
-#ifdef __AVX__
-template<>
-inline _ITM_TYPE_M256
-unaligned_load2<_ITM_TYPE_M256>(const gtm_cacheline *c1,
-                                const gtm_cacheline *c2, size_t ofs)
-{
-  __m128i v0, v1;
-  __m256i r;
-
-  v0 = (__m128i) unaligned_load2<_ITM_TYPE_M128>(c1, c2, ofs);
-  if (ofs < CACHELINE_SIZE - 16)
-    v1 = v0, v0 = _mm_loadu_si128 ((const __m128i *) &c1->b[ofs]);
-  else
-    v1 = _mm_loadu_si128((const __m128i *)&c2->b[ofs + 16 - CACHELINE_SIZE]);
-
-  r = _mm256_castsi128_si256 ((__m128i)v0);
-  r = _mm256_insertf128_si256 (r, (__m128i)v1, 1);
-  return (_ITM_TYPE_M256) r;
-}
-#endif // AVX
-
-} // namespace GTM
-
-#endif // LIBITM_X86_UNALIGNED_H
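Note on the deleted code above: unaligned_load2<T>(c1, c2, ofs) reconstructs a value of type T that starts at byte ofs of cacheline c1 and spills over into cacheline c2, and the shrd-based specializations do that combine in a single double-shift instruction. The following is a minimal portable C++ sketch of the same idea for the 64-bit case, for reference only; the cacheline union and unaligned_load2_u64 are illustrative stand-ins for libitm's gtm_cacheline and the unaligned_load2<uint64_t> specialization, and it assumes the little-endian byte order the real x86 code relies on.

#include <cstdint>
#include <cstdio>
#include <cstring>

static const std::size_t CACHELINE_SIZE = 64;

// Stand-in for libitm's gtm_cacheline, reduced to the members this
// sketch needs.
union cacheline
{
  uint8_t  b[CACHELINE_SIZE];
  uint64_t u64[CACHELINE_SIZE / sizeof (uint64_t)];
};

// Portable equivalent of the unaligned_load2<uint64_t> specialization:
// the last word of c1 supplies the low bytes, the first word of c2 the
// high bytes.  The deleted code fuses the two shifts and the OR into one
// "shrd" instruction.
static uint64_t
unaligned_load2_u64 (const cacheline *c1, const cacheline *c2,
                     std::size_t ofs)
{
  uint64_t lo = c1->u64[CACHELINE_SIZE / sizeof (uint64_t) - 1];
  uint64_t hi = c2->u64[0];
  unsigned shift = (ofs & 7) * 8;
  // A load that genuinely straddles the boundary has ofs & 7 != 0 (an
  // 8-aligned 8-byte load cannot cross a 64-byte line), so 64 - shift
  // stays below 64; a 64-bit shift by 64 would be undefined behavior.
  return (lo >> shift) | (hi << (64 - shift));
}

int main ()
{
  // Two adjacent cachelines filled with the byte pattern 0, 1, 2, ...
  cacheline lines[2];
  for (unsigned i = 0; i < 2 * CACHELINE_SIZE; ++i)
    lines[i / CACHELINE_SIZE].b[i % CACHELINE_SIZE] = (uint8_t) i;

  // Offsets 57..63 make an 8-byte load spill from lines[0] into lines[1].
  std::size_t ofs = 61;
  uint64_t expect;
  std::memcpy (&expect, &lines[0].b[ofs], sizeof expect);

  uint64_t got = unaligned_load2_u64 (&lines[0], &lines[1], ofs);
  std::printf ("got %016llx, expect %016llx\n",
               (unsigned long long) got, (unsigned long long) expect);
  return got == expect ? 0 : 1;
}

The realign_m128i variants take a different route because palignr/vpalignr encode their byte shift as an immediate operand, so a runtime byte_count cannot be handed to a single instruction; the SSSE3, AVX, and SSE2 paths instead "call" through a table of sixteen small code stubs (GTM_palignr_table and friends), evidently one stub per possible shift count. XOP's vpperm takes its selector in a register, which is why that path can use _mm_perm_epi8 directly with a table of selector vectors (GTM_vpperm_shift).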