/* { dg-do run } */ /* { dg-options "-O3 -save-temps" } */ #pragma GCC target "+nosve" #include #include #define DSIZE 16 #define PIXSIZE 64 extern void wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] + pix2[x]; pix1 += 16; pix2 += 16; } } extern void __attribute__((optimize (0))) wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] + pix2[x]; pix1 += 16; pix2 += 16; } } extern void wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] - pix2[x]; pix1 += 16; pix2 += 16; } } extern void __attribute__((optimize (0))) wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] - pix2[x]; pix1 += 16; pix2 += 16; } } extern void wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] * pix2[x]; pix1 += 16; pix2 += 16; } } extern void __attribute__((optimize (0))) wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 ) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] * pix2[x]; pix1 += 16; pix2 += 16; } } extern void wlshift (uint16_t *d, uint8_t *restrict pix1) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] << 8; pix1 += 16; } } extern void __attribute__((optimize (0))) wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1) { for (int y = 0; y < 4; y++ ) { for (int x = 0; x < 4; x++ ) d[x + y*4] = pix1[x] << 8; pix1 += 16; } } void __attribute__((optimize (0))) init_arrays (uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2) { for (int i = 0; i < DSIZE; i++) { d_a[i] = (1074 * i)%17; d_b[i] = (1074 * i)%17; } for (int i = 0; i < PIXSIZE; i++) { pix1[i] = (1024 * i)%17; pix2[i] = (1024 * i)%17; } } /* Don't optimize main so we don't get confused over where the vector instructions are generated. */ __attribute__((optimize (0))) int main () { uint16_t d_a[DSIZE]; uint16_t d_b[DSIZE]; uint8_t pix1[PIXSIZE]; uint8_t pix2[PIXSIZE]; init_arrays (d_a, d_b, pix1, pix2); wplus (d_a, pix1, pix2); wplus_no_opt (d_b, pix1, pix2); if (memcmp (d_a,d_b, DSIZE) != 0) return 1; init_arrays (d_a, d_b, pix1, pix2); wminus (d_a, pix1, pix2); wminus_no_opt (d_b, pix1, pix2); if (memcmp (d_a,d_b, DSIZE) != 0) return 2; init_arrays (d_a, d_b, pix1, pix2); wmult (d_a, pix1, pix2); wmult_no_opt (d_b, pix1, pix2); if (memcmp (d_a,d_b, DSIZE) != 0) return 3; init_arrays (d_a, d_b, pix1, pix2); wlshift (d_a, pix1); wlshift_no_opt (d_b, pix1); if (memcmp (d_a,d_b, DSIZE) != 0) return 4; } /* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */ /* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */ /* { dg-final { scan-assembler-times "umull\\tv" 2 } } */ /* { dg-final { scan-assembler-times "shll\\tv" 2 } } */