/* { dg-do compile } */ /* { dg-options "-O3 -mdejagnu-cpu=power9 -mvsx" } */ /* { dg-require-effective-target powerpc_vsx } */ /* Verify that we vectorize this SAD loop using vabsdub. */ extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)); static int foo (unsigned char *w, int i, unsigned char *x, int j) { int tot = 0; for (int a = 0; a < 16; a++) { for (int b = 0; b < 16; b++) tot += abs (w[b] - x[b]); w += i; x += j; } return tot; } void bar (unsigned char *w, unsigned char *x, int i, int *result) { *result = foo (w, 16, x, i); } /* { dg-final { scan-assembler-times "vabsdub" 16 } } */ /* { dg-final { scan-assembler-times "vsum4ubs" 16 } } */ /* { dg-final { scan-assembler-times "vadduwm" 17 } } */ /* Note: One of the 16 adds is optimized out (add with zero), leaving 15. The extra two adds are for the final reduction. */