aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author: Uros Bizjak <ubizjak@gmail.com> 2008-04-22 14:41:14 +0200
committer: Uros Bizjak <uros@gcc.gnu.org> 2008-04-22 14:41:14 +0200
commit: 6126672ef1f0444459384a046133f1bfab10e728 (patch)
tree: 4fef4bb972ef64542388158dbedcdd6b87c048df /gcc
parent: e842d14ec8d365fa4e529600dd20b5140db2bd94 (diff)
download: gcc-6126672ef1f0444459384a046133f1bfab10e728.zip
download: gcc-6126672ef1f0444459384a046133f1bfab10e728.tar.gz
download: gcc-6126672ef1f0444459384a046133f1bfab10e728.tar.bz2
re PR target/29096 (faster _mm_cvtpi32x2_ps for xmmintrin.h)
PR target/29096
* config/i386/xmmintrin.h (_mm_cvtpi16_ps): Rearrange calls to builtin functions to generate faster code.
(_mm_cvtpu16_ps): Ditto.
(_mm_cvtpi32x2_ps): Ditto.

From-SVN: r134558
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog | 8
-rw-r--r-- gcc/config/i386/xmmintrin.h | 24
2 files changed, 19 insertions, 13 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a0b29d6..d26292e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2008-04-22 Uros Bizjak <ubizjak@gmail.com>
+
+ PR target/29096
+ * config/i386/xmmintrin.h (_mm_cvtpi16_ps): Rearrange calls to
+ builtin functions to generate faster code.
+ (_mm_cvtpu16_ps): Ditto.
+ (_mm_cvtpi32x2_ps): Ditto.
+
2008-04-22 Nick Clifton <nickc@redhat.com>
* common.opt (ftree-loop-distribution): Add Optimization
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f176d74..fcfdaf9 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -621,7 +621,7 @@ _mm_cvtpi16_ps (__m64 __A)
{
__v4hi __sign;
__v2si __hisi, __losi;
- __v4sf __r;
+ __v4sf __zero, __ra, __rb;
/* This comparison against zero gives us a mask that can be used to
fill in the missing sign bits in the unpack operations below, so
@@ -633,12 +633,11 @@ _mm_cvtpi16_ps (__m64 __A)
__losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
/* Convert the doublewords to floating point two at a time. */
- __r = (__v4sf) _mm_setzero_ps ();
- __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
- __r = __builtin_ia32_movlhps (__r, __r);
- __r = __builtin_ia32_cvtpi2ps (__r, __losi);
+ __zero = (__v4sf) _mm_setzero_ps ();
+ __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
+ __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);
- return (__m128) __r;
+ return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}
/* Convert the four unsigned 16-bit values in A to SPFP form. */
@@ -646,19 +645,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif
_mm_cvtpu16_ps (__m64 __A)
{
__v2si __hisi, __losi;
- __v4sf __r;
+ __v4sf __zero, __ra, __rb;
/* Convert the four words to doublewords. */
__hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
__losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
/* Convert the doublewords to floating point two at a time. */
- __r = (__v4sf) _mm_setzero_ps ();
- __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
- __r = __builtin_ia32_movlhps (__r, __r);
- __r = __builtin_ia32_cvtpi2ps (__r, __losi);
+ __zero = (__v4sf) _mm_setzero_ps ();
+ __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
+ __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);
- return (__m128) __r;
+ return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}
/* Convert the low four signed 8-bit values in A to SPFP form. */
@@ -692,7 +690,7 @@ _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
__v4sf __zero = (__v4sf) _mm_setzero_ps ();
__v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
- __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
+ __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}