aboutsummaryrefslogtreecommitdiff
path: root/test cases/common/147 simd
diff options
context:
space:
mode:
Diffstat (limited to 'test cases/common/147 simd')
-rw-r--r--test cases/common/147 simd/fallback.c8
-rw-r--r--test cases/common/147 simd/include/simdheader.h3
-rw-r--r--test cases/common/147 simd/meson.build44
-rw-r--r--test cases/common/147 simd/simd_avx.c49
-rw-r--r--test cases/common/147 simd/simd_avx2.c42
-rw-r--r--test cases/common/147 simd/simd_mmx.c67
-rw-r--r--test cases/common/147 simd/simd_neon.c20
-rw-r--r--test cases/common/147 simd/simd_sse.c29
-rw-r--r--test cases/common/147 simd/simd_sse2.c37
-rw-r--r--test cases/common/147 simd/simd_sse3.c38
-rw-r--r--test cases/common/147 simd/simd_sse41.c40
-rw-r--r--test cases/common/147 simd/simd_sse42.c43
-rw-r--r--test cases/common/147 simd/simd_ssse3.c48
-rw-r--r--test cases/common/147 simd/simdchecker.c143
-rw-r--r--test cases/common/147 simd/simdfuncs.h75
15 files changed, 686 insertions, 0 deletions
diff --git a/test cases/common/147 simd/fallback.c b/test cases/common/147 simd/fallback.c
new file mode 100644
index 0000000..ab435f4
--- /dev/null
+++ b/test cases/common/147 simd/fallback.c
@@ -0,0 +1,8 @@
+#include<simdfuncs.h>
+
+void increment_fallback(float arr[4]) {
+ int i;
+ for(i=0; i<4; i++) {
+ arr[i]++;
+ }
+}
diff --git a/test cases/common/147 simd/include/simdheader.h b/test cases/common/147 simd/include/simdheader.h
new file mode 100644
index 0000000..6515e41
--- /dev/null
+++ b/test cases/common/147 simd/include/simdheader.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#define I_CAN_HAZ_SIMD
diff --git a/test cases/common/147 simd/meson.build b/test cases/common/147 simd/meson.build
new file mode 100644
index 0000000..2628a12
--- /dev/null
+++ b/test cases/common/147 simd/meson.build
@@ -0,0 +1,44 @@
+project('simd', 'c')
+
+simd = import('unstable-simd')
+
+cc = meson.get_compiler('c')
+
+cdata = configuration_data()
+
+if not meson.is_cross_build() and host_machine.cpu_family() == 'arm' and cc.get_id() == 'clang'
+ message('Adding -march=armv7 because assuming that this build happens on Raspbian.')
+ message('Its Clang seems to be misconfigured and does not support NEON by default.')
+ add_project_arguments('-march=armv7', language : 'c')
+endif
+
+if cc.get_id() == 'msvc' and cc.version().version_compare('<17')
+ error('MESON_SKIP_TEST VS2010 produces broken binaries on x86.')
+endif
+
+# FIXME add [a, b] = function()
+rval = simd.check('mysimds',
+ mmx : 'simd_mmx.c',
+ sse : 'simd_sse.c',
+ sse2 : 'simd_sse2.c',
+ sse3 : 'simd_sse3.c',
+ ssse3 : 'simd_ssse3.c',
+ sse41 : 'simd_sse41.c',
+ sse42 : 'simd_sse42.c',
+ avx : 'simd_avx.c',
+ avx2 : 'simd_avx2.c',
+ neon : 'simd_neon.c',
+ compiler : cc,
+ include_directories : include_directories('include'))
+
+simdlibs = rval[0]
+cdata.merge_from(rval[1])
+
+configure_file(output : 'simdconfig.h',
+ configuration : cdata)
+
+p = executable('simdtest', 'simdchecker.c', 'fallback.c',
+ link_with : simdlibs)
+
+test('simdtest', p)
+
diff --git a/test cases/common/147 simd/simd_avx.c b/test cases/common/147 simd/simd_avx.c
new file mode 100644
index 0000000..5f45a4e
--- /dev/null
+++ b/test cases/common/147 simd/simd_avx.c
@@ -0,0 +1,49 @@
+#include<simdheader.h>
+
+#ifndef I_CAN_HAZ_SIMD
+#error The correct internal header was not used
+#endif
+
+#include<simdconfig.h>
+#include<simdfuncs.h>
+#include<stdint.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+int avx_available(void) {
+ return 1;
+}
+#else
+#include<immintrin.h>
+#include<cpuid.h>
+
+#ifdef __APPLE__
+/*
+ * Apple ships a broken __builtin_cpu_supports and
+ * some machines in the CI farm seem to be too
+ * old to have AVX so just always return 0 here.
+ */
+int avx_available(void) { return 0; }
+#else
+
+int avx_available(void) {
+ return __builtin_cpu_supports("avx");
+}
+#endif
+#endif
+
+void increment_avx(float arr[4]) {
+ double darr[4];
+ darr[0] = arr[0];
+ darr[1] = arr[1];
+ darr[2] = arr[2];
+ darr[3] = arr[3];
+ __m256d val = _mm256_loadu_pd(darr);
+ __m256d one = _mm256_set1_pd(1.0);
+ __m256d result = _mm256_add_pd(val, one);
+ _mm256_storeu_pd(darr, result);
+ arr[0] = (float)darr[0];
+ arr[1] = (float)darr[1];
+ arr[2] = (float)darr[2];
+ arr[3] = (float)darr[3];
+}
diff --git a/test cases/common/147 simd/simd_avx2.c b/test cases/common/147 simd/simd_avx2.c
new file mode 100644
index 0000000..c79819b
--- /dev/null
+++ b/test cases/common/147 simd/simd_avx2.c
@@ -0,0 +1,42 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+#include<stdint.h>
+
+/*
+ * FIXME add proper runtime detection for VS.
+ */
+
+#ifdef _MSC_VER
+#include<intrin.h>
+int avx2_available(void) {
+ return 0;
+}
+#else
+#include<immintrin.h>
+#include<cpuid.h>
+
+#if defined(__APPLE__)
+int avx2_available(void) { return 0; }
+#else
+int avx2_available(void) {
+ return __builtin_cpu_supports("avx2");
+}
+#endif
+#endif
+
+void increment_avx2(float arr[4]) {
+ double darr[4];
+ darr[0] = arr[0];
+ darr[1] = arr[1];
+ darr[2] = arr[2];
+ darr[3] = arr[3];
+ __m256d val = _mm256_loadu_pd(darr);
+ __m256d one = _mm256_set1_pd(1.0);
+ __m256d result = _mm256_add_pd(val, one);
+ _mm256_storeu_pd(darr, result);
+ one = _mm256_permute4x64_pd(one, 66); /* A no-op, just here to use AVX2. */
+ arr[0] = (float)darr[0];
+ arr[1] = (float)darr[1];
+ arr[2] = (float)darr[2];
+ arr[3] = (float)darr[3];
+}
diff --git a/test cases/common/147 simd/simd_mmx.c b/test cases/common/147 simd/simd_mmx.c
new file mode 100644
index 0000000..7605442
--- /dev/null
+++ b/test cases/common/147 simd/simd_mmx.c
@@ -0,0 +1,67 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#include<stdint.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+int mmx_available(void) {
+ return 1;
+}
+/* Contrary to MSDN documentation, MMX intrinsics
+ * just plain don't work.
+ */
+void increment_mmx(float arr[4]) {
+ arr[0]++;
+ arr[1]++;
+ arr[2]++;
+ arr[3]++;
+}
+#elif defined(__MINGW32__)
+int mmx_available(void) {
+ return 1;
+}
+/* MinGW does not seem to ship with MMX or it is broken.
+ */
+void increment_mmx(float arr[4]) {
+ arr[0]++;
+ arr[1]++;
+ arr[2]++;
+ arr[3]++;
+}
+#else
+#include<mmintrin.h>
+#include<cpuid.h>
+
+#if defined(__APPLE__)
+int mmx_available(void) { return 1; }
+#else
+int mmx_available(void) {
+ return __builtin_cpu_supports("mmx");
+}
+#endif
+void increment_mmx(float arr[4]) {
+ /* Super ugly but we know that values in arr are always small
+ * enough to fit in int16;
+ */
+ int i;
+ __m64 packed = _mm_set_pi16(arr[3], arr[2], arr[1], arr[0]);
+ __m64 incr = _mm_set1_pi16(1);
+ __m64 result = _mm_add_pi16(packed, incr);
+ /* Should be
+ * int64_t unpacker = _m_to_int64(result);
+ * but it does not exist on 32 bit platforms for some reason.
+ */
+ int64_t unpacker = (int64_t)(result);
+ _mm_empty();
+ for(i=0; i<4; i++) {
+ /* This fails on GCC 8 when optimizations are enabled.
+ * Disable it. Patches welcome to fix this.
+ arr[i] = (float)(unpacker & ((1<<16)-1));
+ unpacker >>= 16;
+ */
+ arr[i] += 1.0f;
+ }
+}
+
+#endif
diff --git a/test cases/common/147 simd/simd_neon.c b/test cases/common/147 simd/simd_neon.c
new file mode 100644
index 0000000..2834b30
--- /dev/null
+++ b/test cases/common/147 simd/simd_neon.c
@@ -0,0 +1,20 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#include<arm_neon.h>
+#include<stdint.h>
+
+int neon_available(void) {
+ return 1; /* Incorrect, but I don't know how to check this properly. */
+}
+
+void increment_neon(float arr[4]) {
+ float32x2_t a1, a2, one;
+ a1 = vld1_f32(arr);
+ a2 = vld1_f32(&arr[2]);
+ one = vdup_n_f32(1.0);
+ a1 = vadd_f32(a1, one);
+ a2 = vadd_f32(a2, one);
+ vst1_f32(arr, a1);
+ vst1_f32(&arr[2], a2);
+}
diff --git a/test cases/common/147 simd/simd_sse.c b/test cases/common/147 simd/simd_sse.c
new file mode 100644
index 0000000..6014e0c
--- /dev/null
+++ b/test cases/common/147 simd/simd_sse.c
@@ -0,0 +1,29 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+int sse_available(void) {
+ return 1;
+}
+#else
+
+#include<xmmintrin.h>
+#include<cpuid.h>
+#include<stdint.h>
+
+#if defined(__APPLE__)
+int sse_available(void) { return 1; }
+#else
+int sse_available(void) {
+ return __builtin_cpu_supports("sse");
+}
+#endif
+#endif
+
+void increment_sse(float arr[4]) {
+ __m128 val = _mm_load_ps(arr);
+ __m128 one = _mm_set_ps1(1.0);
+ __m128 result = _mm_add_ps(val, one);
+ _mm_storeu_ps(arr, result);
+}
diff --git a/test cases/common/147 simd/simd_sse2.c b/test cases/common/147 simd/simd_sse2.c
new file mode 100644
index 0000000..445afb6
--- /dev/null
+++ b/test cases/common/147 simd/simd_sse2.c
@@ -0,0 +1,37 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+#include<emmintrin.h>
+
+#ifdef _MSC_VER
+int sse2_available(void) {
+ return 1;
+}
+
+#else
+#include<cpuid.h>
+#include<stdint.h>
+
+#if defined(__APPLE__)
+int sse2_available(void) { return 1; }
+#else
+int sse2_available(void) {
+ return __builtin_cpu_supports("sse2");
+}
+#endif
+#endif
+
+void increment_sse2(float arr[4]) {
+ ALIGN_16 double darr[4];
+ __m128d val1 = _mm_set_pd(arr[0], arr[1]);
+ __m128d val2 = _mm_set_pd(arr[2], arr[3]);
+ __m128d one = _mm_set_pd(1.0, 1.0);
+ __m128d result = _mm_add_pd(val1, one);
+ _mm_store_pd(darr, result);
+ result = _mm_add_pd(val2, one);
+ _mm_store_pd(&darr[2], result);
+ arr[0] = (float)darr[1];
+ arr[1] = (float)darr[0];
+ arr[2] = (float)darr[3];
+ arr[3] = (float)darr[2];
+}
+
diff --git a/test cases/common/147 simd/simd_sse3.c b/test cases/common/147 simd/simd_sse3.c
new file mode 100644
index 0000000..29a35e6
--- /dev/null
+++ b/test cases/common/147 simd/simd_sse3.c
@@ -0,0 +1,38 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+int sse3_available(void) {
+ return 1;
+}
+#else
+
+#include<pmmintrin.h>
+#include<cpuid.h>
+#include<stdint.h>
+
+#if defined(__APPLE__)
+int sse3_available(void) { return 1; }
+#else
+int sse3_available(void) {
+ return __builtin_cpu_supports("sse3");
+}
+#endif
+#endif
+
+void increment_sse3(float arr[4]) {
+ ALIGN_16 double darr[4];
+ __m128d val1 = _mm_set_pd(arr[0], arr[1]);
+ __m128d val2 = _mm_set_pd(arr[2], arr[3]);
+ __m128d one = _mm_set_pd(1.0, 1.0);
+ __m128d result = _mm_add_pd(val1, one);
+ _mm_store_pd(darr, result);
+ result = _mm_add_pd(val2, one);
+ _mm_store_pd(&darr[2], result);
+ result = _mm_hadd_pd(val1, val2); /* This does nothing. Only here so we use an SSE3 instruction. */
+ arr[0] = (float)darr[1];
+ arr[1] = (float)darr[0];
+ arr[2] = (float)darr[3];
+ arr[3] = (float)darr[2];
+}
diff --git a/test cases/common/147 simd/simd_sse41.c b/test cases/common/147 simd/simd_sse41.c
new file mode 100644
index 0000000..29f2555
--- /dev/null
+++ b/test cases/common/147 simd/simd_sse41.c
@@ -0,0 +1,40 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#include<stdint.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+
+int sse41_available(void) {
+ return 1;
+}
+
+#else
+#include<smmintrin.h>
+#include<cpuid.h>
+
+#if defined(__APPLE__)
+int sse41_available(void) { return 1; }
+#else
+int sse41_available(void) {
+ return __builtin_cpu_supports("sse4.1");
+}
+#endif
+#endif
+
+void increment_sse41(float arr[4]) {
+ ALIGN_16 double darr[4];
+ __m128d val1 = _mm_set_pd(arr[0], arr[1]);
+ __m128d val2 = _mm_set_pd(arr[2], arr[3]);
+ __m128d one = _mm_set_pd(1.0, 1.0);
+ __m128d result = _mm_add_pd(val1, one);
+ result = _mm_ceil_pd(result); /* A no-op, only here to use a SSE4.1 intrinsic. */
+ _mm_store_pd(darr, result);
+ result = _mm_add_pd(val2, one);
+ _mm_store_pd(&darr[2], result);
+ arr[0] = (float)darr[1];
+ arr[1] = (float)darr[0];
+ arr[2] = (float)darr[3];
+ arr[3] = (float)darr[2];
+}
diff --git a/test cases/common/147 simd/simd_sse42.c b/test cases/common/147 simd/simd_sse42.c
new file mode 100644
index 0000000..f1564e2
--- /dev/null
+++ b/test cases/common/147 simd/simd_sse42.c
@@ -0,0 +1,43 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+#include<stdint.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+
+int sse42_available(void) {
+ return 1;
+}
+
+#else
+
+#include<nmmintrin.h>
+#include<cpuid.h>
+
+#ifdef __APPLE__
+int sse42_available(void) {
+ return 1;
+}
+#else
+int sse42_available(void) {
+ return __builtin_cpu_supports("sse4.2");
+}
+#endif
+
+#endif
+
+void increment_sse42(float arr[4]) {
+ ALIGN_16 double darr[4];
+ __m128d val1 = _mm_set_pd(arr[0], arr[1]);
+ __m128d val2 = _mm_set_pd(arr[2], arr[3]);
+ __m128d one = _mm_set_pd(1.0, 1.0);
+ __m128d result = _mm_add_pd(val1, one);
+ _mm_store_pd(darr, result);
+ result = _mm_add_pd(val2, one);
+ _mm_store_pd(&darr[2], result);
+ _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */
+ arr[0] = (float)darr[1];
+ arr[1] = (float)darr[0];
+ arr[2] = (float)darr[3];
+ arr[3] = (float)darr[2];
+}
diff --git a/test cases/common/147 simd/simd_ssse3.c b/test cases/common/147 simd/simd_ssse3.c
new file mode 100644
index 0000000..fa557f4
--- /dev/null
+++ b/test cases/common/147 simd/simd_ssse3.c
@@ -0,0 +1,48 @@
+#include<simdconfig.h>
+#include<simdfuncs.h>
+
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+#ifdef _MSC_VER
+#include<intrin.h>
+
+int ssse3_available(void) {
+ return 1;
+}
+
+#else
+
+#include<cpuid.h>
+#include<stdint.h>
+
+int ssse3_available(void) {
+#ifdef __APPLE__
+ return 1;
+#elif defined(__clang__)
+ /* https://github.com/numpy/numpy/issues/8130 */
+ return __builtin_cpu_supports("sse4.1");
+#else
+ return __builtin_cpu_supports("ssse3");
+#endif
+}
+
+#endif
+
+void increment_ssse3(float arr[4]) {
+ ALIGN_16 double darr[4];
+ __m128d val1 = _mm_set_pd(arr[0], arr[1]);
+ __m128d val2 = _mm_set_pd(arr[2], arr[3]);
+ __m128d one = _mm_set_pd(1.0, 1.0);
+ __m128d result = _mm_add_pd(val1, one);
+ __m128i tmp1, tmp2;
+ tmp1 = tmp2 = _mm_set1_epi16(0);
+ _mm_store_pd(darr, result);
+ result = _mm_add_pd(val2, one);
+ _mm_store_pd(&darr[2], result);
+ tmp1 = _mm_hadd_epi32(tmp1, tmp2); /* This does nothing. Only here so we use an SSSE3 instruction. */
+ arr[0] = (float)darr[1];
+ arr[1] = (float)darr[0];
+ arr[2] = (float)darr[3];
+ arr[3] = (float)darr[2];
+}
diff --git a/test cases/common/147 simd/simdchecker.c b/test cases/common/147 simd/simdchecker.c
new file mode 100644
index 0000000..c7a0a97
--- /dev/null
+++ b/test cases/common/147 simd/simdchecker.c
@@ -0,0 +1,143 @@
+#include<simdfuncs.h>
+#include<stdio.h>
+#include<string.h>
+
+typedef void (*simd_func)(float*);
+
+int check_simd_implementation(float *four,
+ const float *four_initial,
+ const char *simd_type,
+ const float *expected,
+ simd_func fptr,
+ const int blocksize) {
+ int rv = 0;
+ memcpy(four, four_initial, blocksize*sizeof(float));
+ printf("Using %s.\n", simd_type);
+ fptr(four);
+ for(int i=0; i<blocksize; i++) {
+ if(four[i] != expected[i]) {
+ printf("Increment function failed, got %f expected %f.\n", four[i], expected[i]);
+ rv = 1;
+ }
+ }
+ return rv;
+}
+
+int main(void) {
+ static const float four_initial[4] = {2.0, 3.0, 4.0, 5.0};
+ ALIGN_16 float four[4];
+ const float expected[4] = {3.0, 4.0, 5.0, 6.0};
+ int r=0;
+ const int blocksize = 4;
+
+/*
+ * Test all implementations that the current CPU supports.
+ */
+#if HAVE_NEON
+ if(neon_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "NEON",
+ expected,
+ increment_neon,
+ blocksize);
+ }
+#endif
+#if HAVE_AVX2
+ if(avx2_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "AVX2",
+ expected,
+ increment_avx2,
+ blocksize);
+ }
+#endif
+#if HAVE_AVX
+ if(avx_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "AVC",
+ expected,
+ increment_avx,
+ blocksize);
+ }
+#endif
+#if HAVE_SSE42
+ if(sse42_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSR42",
+ expected,
+ increment_sse42,
+ blocksize);
+ }
+#endif
+#if HAVE_SSE41
+ if(sse41_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSE41",
+ expected,
+ increment_sse41,
+ blocksize);
+ }
+#endif
+#if HAVE_SSSE3
+ if(ssse3_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSSE3",
+ expected,
+ increment_ssse3,
+ blocksize);
+ }
+#endif
+#if HAVE_SSE3
+ if(sse3_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSE3",
+ expected,
+ increment_sse3,
+ blocksize);
+ }
+#endif
+#if HAVE_SSE2
+ if(sse2_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSE2",
+ expected,
+ increment_sse2,
+ blocksize);
+ }
+#endif
+#if HAVE_SSE
+ if(sse_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "SSE",
+ expected,
+ increment_sse,
+ blocksize);
+ }
+#endif
+#if HAVE_MMX
+ if(mmx_available()) {
+ r += check_simd_implementation(four,
+ four_initial,
+ "MMX",
+ expected,
+ increment_mmx,
+ blocksize);
+ }
+#endif
+ r += check_simd_implementation(four,
+ four_initial,
+ "fallback",
+ expected,
+ increment_fallback,
+ blocksize);
+ return r;
+}
diff --git a/test cases/common/147 simd/simdfuncs.h b/test cases/common/147 simd/simdfuncs.h
new file mode 100644
index 0000000..d820f25
--- /dev/null
+++ b/test cases/common/147 simd/simdfuncs.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include<simdconfig.h>
+
+#ifdef _MSC_VER
+#define ALIGN_16 __declspec(align(16))
+#else
+#include<stdalign.h>
+#define ALIGN_16 alignas(16)
+#endif
+
+
+/* Yes, I do know that arr[4] decays into a pointer
+ * as a function argument. Don't do this in real code
+ * but for this test it is ok.
+ */
+
+void increment_fallback(float arr[4]);
+
+#if HAVE_MMX
+int mmx_available(void);
+void increment_mmx(float arr[4]);
+#endif
+
+#if HAVE_SSE
+int sse_available(void);
+void increment_sse(float arr[4]);
+#endif
+
+#if HAVE_SSE2
+int sse2_available(void);
+void increment_sse2(float arr[4]);
+#endif
+
+#if HAVE_SSE3
+int sse3_available(void);
+void increment_sse3(float arr[4]);
+#endif
+
+#if HAVE_SSSE3
+int ssse3_available(void);
+void increment_ssse3(float arr[4]);
+#endif
+
+#if HAVE_SSE41
+int sse41_available(void);
+void increment_sse41(float arr[4]);
+#endif
+
+#if HAVE_SSE42
+int sse42_available(void);
+void increment_sse42(float arr[4]);
+#endif
+
+#if HAVE_AVX
+int avx_available(void);
+void increment_avx(float arr[4]);
+#endif
+
+#if HAVE_AVX2
+int avx2_available(void);
+void increment_avx2(float arr[4]);
+#endif
+
+#if HAVE_NEON
+int neon_available(void);
+void increment_neon(float arr[4]);
+#endif
+
+#if HAVE_ALTIVEC
+int altivec_available(void);
+void increment_altivec(float arr[4]);
+#endif
+
+/* And so on. */