diff options
author | Uros Bizjak <ubizjak@gmail.com> | 2008-04-22 14:41:14 +0200 |
---|---|---|
committer | Uros Bizjak <uros@gcc.gnu.org> | 2008-04-22 14:41:14 +0200 |
commit | 6126672ef1f0444459384a046133f1bfab10e728 (patch) | |
tree | 4fef4bb972ef64542388158dbedcdd6b87c048df /gcc | |
parent | e842d14ec8d365fa4e529600dd20b5140db2bd94 (diff) | |
download | gcc-6126672ef1f0444459384a046133f1bfab10e728.zip gcc-6126672ef1f0444459384a046133f1bfab10e728.tar.gz gcc-6126672ef1f0444459384a046133f1bfab10e728.tar.bz2 |
re PR target/29096 (faster _mm_cvtpi32x2_ps for xmmintrin.h)
PR target/29096
* config/i386/xmmintrin.h (_mm_cvtpi16_ps): Rearrange calls to
builtin functions to generate faster code.
(_mm_cvtpu16_ps): Ditto.
(_mm_cvtpi32x2_ps): Ditto.
From-SVN: r134558
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 8 | ||||
-rw-r--r-- | gcc/config/i386/xmmintrin.h | 24 |
2 files changed, 19 insertions, 13 deletions
```diff
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a0b29d6..d26292e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2008-04-22 Uros Bizjak <ubizjak@gmail.com>
+
+        PR target/29096
+        * config/i386/xmmintrin.h (_mm_cvtpi16_ps): Rearrange calls to
+        builtin functions to generate faster code.
+        (_mm_cvtpu16_ps): Ditto.
+        (_mm_cvtpi32x2_ps): Ditto.
+
 2008-04-22 Nick Clifton <nickc@redhat.com>
 
         * common.opt (ftree-loop-distribution): Add Optimization
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f176d74..fcfdaf9 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -621,7 +621,7 @@ _mm_cvtpi16_ps (__m64 __A)
 {
   __v4hi __sign;
   __v2si __hisi, __losi;
-  __v4sf __r;
+  __v4sf __zero, __ra, __rb;
 
   /* This comparison against zero gives us a mask that can be used to
      fill in the missing sign bits in the unpack operations below, so
@@ -633,12 +633,11 @@ _mm_cvtpi16_ps (__m64 __A)
   __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
 
   /* Convert the doublewords to floating point two at a time. */
-  __r = (__v4sf) _mm_setzero_ps ();
-  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
-  __r = __builtin_ia32_movlhps (__r, __r);
-  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);
 
-  return (__m128) __r;
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
 }
 
 /* Convert the four unsigned 16-bit values in A to SPFP form. */
@@ -646,19 +645,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif
 _mm_cvtpu16_ps (__m64 __A)
 {
   __v2si __hisi, __losi;
-  __v4sf __r;
+  __v4sf __zero, __ra, __rb;
 
   /* Convert the four words to doublewords. */
   __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
   __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
 
   /* Convert the doublewords to floating point two at a time. */
-  __r = (__v4sf) _mm_setzero_ps ();
-  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
-  __r = __builtin_ia32_movlhps (__r, __r);
-  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);
 
-  return (__m128) __r;
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
 }
 
 /* Convert the low four signed 8-bit values in A to SPFP form. */
@@ -692,7 +690,7 @@ _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
 {
   __v4sf __zero = (__v4sf) _mm_setzero_ps ();
   __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
-  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
+  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
   return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
 }
 
```