aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author liuhongt <hongtao.liu@intel.com> 2023-03-06 15:35:37 +0800
committer liuhongt <hongtao.liu@intel.com> 2023-05-30 17:53:58 +0800
commit d8545fb2c71683f407bfd96706103297d4d6e27b (patch)
tree 0cccd887436470d455190f3ccba625f9ee0e3619 /gcc
parent 365b1d5493988b6bd40183d1fe49bd8a3b32a6bb (diff)
download gcc-d8545fb2c71683f407bfd96706103297d4d6e27b.zip
gcc-d8545fb2c71683f407bfd96706103297d4d6e27b.tar.gz
gcc-d8545fb2c71683f407bfd96706103297d4d6e27b.tar.bz2
Detect bswap + rotate for byte permutation in pass_bswap.
The patch doesn't handle:
1. cast64_to_32,
2. memory source with rsize < range.

gcc/ChangeLog:

	PR middle-end/108938
	* gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
	function, cut from original find_bswap_or_nop function.
	(find_bswap_or_nop): Add a new parameter, detect bswap +
	rotate and save rotate result in the new parameter.
	(bswap_replace): Add a new parameter to indicate rotate and
	generate rotate stmt if needed.
	(maybe_optimize_vector_constructor): Adjust for new rotate
	parameter in the upper 2 functions.
	(pass_optimize_bswap::execute): Ditto.
	(imm_store_chain_info::output_merged_store): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr108938-1.c: New test.
	* gcc.target/i386/pr108938-2.c: New test.
	* gcc.target/i386/pr108938-3.c: New test.
	* gcc.target/i386/pr108938-load-1.c: New test.
	* gcc.target/i386/pr108938-load-2.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/gimple-ssa-store-merging.cc 130
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr108938-1.c 79
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr108938-2.c 35
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr108938-3.c 26
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr108938-load-1.c 69
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr108938-load-2.c 30
6 files changed, 342 insertions, 27 deletions
diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index df7afd2..9cb574f 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
n->range *= BITS_PER_UNIT;
}
+/* Helper function for find_bswap_or_nop. Return true if N is a bswap or a
+ nop, setting *BSWAP accordingly and clearing zero markers of N in *MASK. */
+static bool
+is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
+ uint64_t cmpnop, uint64_t* mask,
+ bool* bswap)
+{
+ *mask = ~(uint64_t) 0;
+ if (n == cmpnop)
+ *bswap = false;
+ else if (n == cmpxchg)
+ *bswap = true;
+ else
+ {
+ int set = 0;
+ for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
+ if ((n & msk) == 0)
+ *mask &= ~msk;
+ else if ((n & msk) == (cmpxchg & msk))
+ set++;
+ else
+ return false;
+
+ if (set < 2)
+ return false;
+ *bswap = true;
+ }
+ return true;
+}
+
+
/* Check if STMT completes a bswap implementation or a read in a given
endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
accordingly. It also sets N to represent the kind of operations
@@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
gimple *
find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
- bool *cast64_to_32, uint64_t *mask)
+ bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
{
tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
if (!tree_fits_uhwi_p (type_size))
@@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
}
uint64_t cmpxchg, cmpnop;
+ uint64_t orig_range = n->range * BITS_PER_UNIT;
find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32);
/* A complete byte swap should make the symbolic number to start with
the largest digit in the highest order byte. Unchanged symbolic
number indicates a read with same endianness as target architecture. */
- *mask = ~(uint64_t) 0;
- if (n->n == cmpnop)
- *bswap = false;
- else if (n->n == cmpxchg)
- *bswap = true;
- else
+ *l_rotate = 0;
+ uint64_t tmp_n = n->n;
+ if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
{
- int set = 0;
- for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
- if ((n->n & msk) == 0)
- *mask &= ~msk;
- else if ((n->n & msk) == (cmpxchg & msk))
- set++;
- else
- return NULL;
- if (set < 2)
+ /* Try bswap + lrotate. */
+ /* TODO: handle cast64_to_32 and big/little-endian memory
+ source when rsize < range. */
+ if (n->range == orig_range
+ && ((orig_range == 32
+ && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
+ || (orig_range == 64
+ && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
+ && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
+ {
+ uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER;
+ uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER;
+ /* E.g. handle 0x203040506070800, where the lowest byte is zero. */
+ if (!count)
+ {
+ for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++)
+ {
+ count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK;
+ if (count)
+ {
+ /* Count should be meaningful, not 0xff. */
+ if (count <= range / BITS_PER_MARKER)
+ {
+ count = (count + i) * BITS_PER_MARKER % range;
+ break;
+ }
+ else
+ return NULL;
+ }
+ }
+ }
+ tmp_n = tmp_n >> count | tmp_n << (range - count);
+ if (orig_range == 32)
+ tmp_n &= (1ULL << 32) - 1;
+ if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
+ return NULL;
+ *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT;
+ gcc_assert (*bswap);
+ }
+ else
return NULL;
- *bswap = true;
}
/* Useless bit manipulation performed by code. */
@@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val,
tree
bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
tree bswap_type, tree load_type, struct symbolic_number *n,
- bool bswap, uint64_t mask)
+ bool bswap, uint64_t mask, uint64_t l_rotate)
{
tree src, tmp, tgt = NULL_TREE;
- gimple *bswap_stmt, *mask_stmt = NULL;
+ gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL;
tree_code conv_code = NOP_EXPR;
gimple *cur_stmt = gsi_stmt (gsi);
@@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
tmp = tgt;
}
+ if (l_rotate)
+ {
+ tree m = build_int_cst (bswap_type, l_rotate);
+ tmp = make_temp_ssa_name (bswap_type, NULL,
+ mask_stmt ? "bswapmaskdst" : "bswapdst");
+ gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+ rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m);
+ tmp = tgt;
+ }
+
/* Convert the result if necessary. */
if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
{
@@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT);
}
- gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp);
+ gimple_set_lhs (rotl_stmt ? rotl_stmt
+ : mask_stmt ? mask_stmt : bswap_stmt, tmp);
if (dump_file)
{
@@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
if (cur_stmt)
{
+ if (rotl_stmt)
+ gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
if (mask_stmt)
gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT);
gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
@@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
if (mask_stmt)
gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT);
+ if (rotl_stmt)
+ gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT);
}
return tgt;
}
@@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
}
bool cast64_to_32;
- uint64_t mask;
+ uint64_t mask, l_rotate;
gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
- &cast64_to_32, &mask);
+ &cast64_to_32, &mask, &l_rotate);
if (!ins_stmt
|| n.range != (unsigned HOST_WIDE_INT) sz
|| cast64_to_32
@@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
memset (&nop_stats, 0, sizeof (nop_stats));
memset (&bswap_stats, 0, sizeof (bswap_stats));
return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
- bswap_type, load_type, &n, bswap, mask) != NULL_TREE;
+ bswap_type, load_type, &n, bswap, mask,
+ l_rotate) != NULL_TREE;
}
/* Find manual byte swap implementations as well as load in a given
@@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun)
enum tree_code code;
struct symbolic_number n;
bool bswap, cast64_to_32;
- uint64_t mask;
+ uint64_t mask, l_rotate;
/* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
might be moved to a different basic block by bswap_replace and gsi
@@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun)
}
ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
- &cast64_to_32, &mask);
+ &cast64_to_32, &mask, &l_rotate);
if (!ins_stmt)
continue;
@@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun)
continue;
if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
- bswap_type, load_type, &n, bswap, mask))
+ bswap_type, load_type, &n, bswap, mask,
+ l_rotate))
changed = true;
}
}
@@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group)
}
bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
bswap_type, load_type, n, bswap,
- ~(uint64_t) 0);
+ ~(uint64_t) 0, 0);
gcc_assert (bswap_res);
}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c
new file mode 100644
index 0000000..67f245d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c
@@ -0,0 +1,79 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */
+
+#include<stdint.h>
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (uint64_t x)
+{
+ return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (uint64_t x)
+{
+ return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (uint64_t x)
+{
+ return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) |
+ (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) |
+ (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) |
+ (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) |
+ (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) |
+ (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) |
+ (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) |
+ (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48));
+}
+
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (uint32_t x)
+{
+ return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (uint32_t x)
+{
+ return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_2 (uint32_t x)
+{
+ return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) |
+ (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) |
+ (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) |
+ (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8));
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c
new file mode 100644
index 0000000..47a2c89
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-1.c"
+
+int main ()
+{
+ uint64_t a = 0x0807060504030201ULL;
+ uint64_t res = swap_rotate_64 (a);
+ if (res != 0x0203040506070801ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_1 (a);
+ if (res != 0x0203040506070800ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_2 (a);
+ if (res != 0x0203040506070001ULL)
+ __builtin_abort ();
+
+ uint32_t b = 0x04030201;
+ uint32_t res2 = swap_rotate_32 (b);
+ if (res2 != 0x03040102)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_1 (b);
+ if (res2 != 0x03040002)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_2 (b);
+ if (res2 != 0x03040100)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c
new file mode 100644
index 0000000..32ac544
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */
+
+void
+foo1 (char* a, unsigned int* __restrict b)
+{
+ a[0] = b[0] >> 24;
+ a[1] = b[0] >> 16;
+ a[2] = b[0] >> 8;
+ a[3] = b[0];
+ a[4] = b[1] >> 24;
+ a[5] = b[1] >> 16;
+ a[6] = b[1] >> 8;
+ a[7] = b[1];
+}
+
+void
+foo2 (char* a, short* __restrict b)
+{
+ a[0] = b[0] >> 8;
+ a[1] = b[0];
+ a[2] = b[1] >> 8;
+ a[3] = b[1];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
new file mode 100644
index 0000000..50d3a50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-movbe" } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */
+
+#include<stdint.h>
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 0 |
+ (uint64_t)(x[1]) << 56 |
+ (uint64_t)(x[2]) << 48 |
+ (uint64_t)(x[3]) << 40 |
+ (uint64_t)(x[4]) << 32 |
+ (uint64_t)(x[5]) << 24 |
+ (uint64_t)(x[6]) << 16 |
+ (uint64_t)(x[7]) << 8);
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_1 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 24 |
+ (uint64_t)(x[1]) << 16 |
+ (uint64_t)(x[2]) << 8 |
+ (uint64_t)(0) << 0 |
+ (uint64_t)(x[4]) << 56 |
+ (uint64_t)(x[5]) << 48 |
+ (uint64_t)(x[6]) << 40 |
+ (uint64_t)(x[7]) << 32);
+}
+
+uint64_t
+__attribute__((noipa))
+swap_rotate_64_mask_2 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 0 |
+ (uint64_t)(x[1]) << 56 |
+ (uint64_t)(x[2]) << 48 |
+ (uint64_t)(0) << 40 |
+ (uint64_t)(x[4]) << 32 |
+ (uint64_t)(x[5]) << 24 |
+ (uint64_t)(x[6]) << 16 |
+ (uint64_t)(x[7]) << 8);
+}
+
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 8 |
+ (uint64_t)(x[1]) << 0 |
+ (uint64_t)(x[2]) << 24 |
+ (uint64_t)(x[3]) << 16);
+}
+
+uint32_t
+__attribute__((noipa))
+swap_rotate_32_mask_1 (unsigned char* x)
+{
+ return ((uint64_t)(x[0]) << 8 |
+ (uint64_t)(0) << 0 |
+ (uint64_t)(x[2]) << 24 |
+ (uint64_t)(x[3]) << 16);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
new file mode 100644
index 0000000..51a8102
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr108938-load-1.c"
+
+int main ()
+{
+ unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+ uint64_t res = swap_rotate_64 (a);
+ if (res != 0x0203040506070801ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_1 (a);
+ if (res != 0x0506070801020300ULL)
+ __builtin_abort ();
+
+ res = swap_rotate_64_mask_2 (a);
+ if (res != 0x0203000506070801ULL)
+ __builtin_abort ();
+
+ uint32_t res2 = swap_rotate_32 (a);
+ if (res2 != 0x03040102)
+ __builtin_abort ();
+
+ res2 = swap_rotate_32_mask_1 (a);
+ if (res2 != 0x03040100)
+ __builtin_abort ();
+
+ return 0;
+}