/* { dg-do run { target aarch64_sve256_hw } } */ /* { dg-options "-march=armv8-a+sve -msve-vector-bits=256 -fopenmp -O2" } */ #include #pragma omp declare reduction (+:svint32_t: omp_out = svadd_s32_z (svptrue_b32(), omp_in, omp_out)) \ initializer (omp_priv = svindex_s32 (0, 0)) void __attribute__ ((noipa)) parallel_reduction () { int a[8] = {1, 1, 1, 1, 1, 1, 1, 1}; int b[8] = {0, 0, 0, 0, 0, 0, 0, 0}; svint32_t va = svld1_s32 (svptrue_b32 (), b); int i = 0; int64_t res; #pragma omp parallel reduction (+:va, i) { va = svld1_s32 (svptrue_b32 (), a); i++; } res = svaddv_s32 (svptrue_b32 (), va); if (res != i * 8) __builtin_abort (); } void __attribute__ ((noipa)) for_reduction () { int a[8] = {1, 1, 1, 1, 1, 1, 1, 1}; int b[8] = {0, 0, 0, 0, 0, 0, 0, 0}; svint32_t va = svld1_s32 (svptrue_b32 (), b); int j; int64_t res; #pragma omp parallel for reduction (+:va) for (j = 0; j < 8; j++) va += svld1_s32 (svptrue_b32 (), a); res = svaddv_s32 (svptrue_b32 (), va); if (res != 64) __builtin_abort (); } void __attribute__ ((noipa)) simd_reduction () { int a[8]; svint32_t va = svindex_s32 (0, 0); int i = 0; int j; int64_t res = 0; for (j = 0; j < 8; j++) a[j] = 1; #pragma omp simd reduction (+:va) for (j = 0; j < 16; j++) va += svld1_s32 (svptrue_b32 (), a); res = svaddv_s32 (svptrue_b32 (), va); if (res != 128) __builtin_abort (); } void __attribute__ ((noipa)) inscan_reduction_incl () { svint32_t va = svindex_s32 (0, 0); int a[8] = {1, 1, 1, 1, 1, 1, 1, 1}; int b[64] = { 0 }; int j; int64_t res = 0; #pragma omp parallel for reduction (inscan, +:va) for (j = 0; j < 8; j++) { va += svld1_s32 (svptrue_b32 (), a); #pragma omp scan inclusive (va) svst1_s32 (svptrue_b32 (), b + j * 8, va); } res = svaddv_s32 (svptrue_b32 (), va); if (res != 64) __builtin_abort (); for (j = 0; j < 64; j+=8) if (b[j] != (j / 8 + 1)) __builtin_abort (); } void __attribute__ ((noipa)) inscan_reduction_excl () { svint32_t va = svindex_s32 (0, 0); int a[8] = {1, 1, 1, 1, 1, 1, 1, 1}; int b[64] = { 0 }; int j; int64_t res = 0; #pragma omp parallel for reduction (inscan, +:va) for (j = 0; j < 8; j++) { svst1_s32 (svptrue_b32 (), b + j * 8, va); #pragma omp scan exclusive (va) va += svld1_s32 (svptrue_b32 (), a); } res = svaddv_s32 (svptrue_b32 (), va); if (res != 64) __builtin_abort (); for (j = 0; j < 64; j+=8) if (b[j] != j / 8) __builtin_abort (); } int main () { parallel_reduction (); for_reduction (); simd_reduction (); inscan_reduction_incl (); inscan_reduction_excl (); }