diff options
author | Qiu Chaofan <qiucofan@cn.ibm.com> | 2020-11-10 10:52:13 +0800 |
---|---|---|
committer | Qiu Chaofan <qiucofan@cn.ibm.com> | 2020-11-10 10:52:13 +0800 |
commit | 979a4d268a48c27d9c0dce642912bcf648614ef8 (patch) | |
tree | abf4a662fece38f398715833d01ab2795c6c5ad8 | |
parent | 1cbf8e89b54de939420d53d7a528bec6fbaf0a55 (diff) | |
download | llvm-979a4d268a48c27d9c0dce642912bcf648614ef8.zip llvm-979a4d268a48c27d9c0dce642912bcf648614ef8.tar.gz llvm-979a4d268a48c27d9c0dce642912bcf648614ef8.tar.bz2 |
[PowerPC] [Clang] Port SSE4.1-compatible insert intrinsics
This patch adds three intrinsics compatible to x86's SSE 4.1 on PowerPC
target, with tests:
- _mm_insert_epi8
- _mm_insert_epi32
- _mm_insert_epi64
The intrinsics implementation is contributed by Paul Clarke.
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D89242
-rw-r--r-- | clang/lib/Headers/ppc_wrappers/smmintrin.h | 24 | ||||
-rw-r--r-- | clang/test/CodeGen/ppc-smmintrin.c | 29 |
2 files changed, 53 insertions, 0 deletions
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h index 56ef6ba..64f0c76 100644 --- a/clang/lib/Headers/ppc_wrappers/smmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -78,6 +78,30 @@ extern __inline __m128i return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); } +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { + __v16qi result = (__v16qi)__A; + result[__N & 0xf] = __D; + return (__m128i)result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { + __v4si result = (__v4si)__A; + result[__N & 3] = __D; + return (__m128i)result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { + __v2di result = (__v2di)__A; + result[__N & 1] = __D; + return (__m128i)result; +} + #else #include_next <smmintrin.h> #endif /* defined(__linux__) && defined(__ppc64__) */ diff --git a/clang/test/CodeGen/ppc-smmintrin.c b/clang/test/CodeGen/ppc-smmintrin.c index c3245ab..8deec9e 100644 --- a/clang/test/CodeGen/ppc-smmintrin.c +++ b/clang/test/CodeGen/ppc-smmintrin.c @@ -116,3 +116,32 @@ test_blend() { // CHECK-NEXT: [[REG80:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG76]], <16 x i8> [[REG78]], <16 x i8> [[REG79]]) // CHECK-NEXT: [[REG81:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG80]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[REG81]] + +void __attribute__((noinline)) +test_insert() { + _mm_insert_epi8(m1, 1, 0); + _mm_insert_epi32(m1, 1, 0); + _mm_insert_epi64(m1, 0xFFFFFFFF1L, 0); +} + +// CHECK-LABEL: @test_insert + +// CHECK: define available_externally <2 x i64> @_mm_insert_epi8(<2 x i64> {{[0-9a-zA-Z_%.]+}}, i32 signext {{[0-9a-zA-Z_%.]+}}, i32 signext {{[0-9a-zA-Z_%.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <16 x i8> +// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 +// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 +// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i8 %[[R0]], i32 %[[R1]] +// CHECK: %[[R2:[0-9a-zA-Z_.]+]] = bitcast <16 x i8> %{{[0-9a-zA-Z_.]+}} to <2 x i64> +// CHECK: ret <2 x i64> %[[R2]] + +// CHECK: define available_externally <2 x i64> @_mm_insert_epi32(<2 x i64> {{[0-9a-zA-Z_%.]+}}, i32 signext {{[0-9a-zA-Z_%.]+}}, i32 signext {{[0-9a-zA-Z_%.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <4 x i32> +// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0]] +// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = bitcast <4 x i32> %{{[0-9a-zA-Z_.]+}} to <2 x i64> +// CHECK: ret <2 x i64> %[[R1]] + +// CHECK: define available_externally <2 x i64> @_mm_insert_epi64(<2 x i64> {{[0-9a-zA-Z_%.]+}}, i64 {{[0-9a-zA-Z_%.]+}}, i32 signext {{[0-9a-zA-Z_%.]+}}) +// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 +// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0:[0-9a-zA-Z_.]+]] +// CHECK: ret <2 x i64> %{{[0-9a-zA-Z_.]+}} |