about | summary | refs | log | tree | commit | diff
path: root/gcc
diff options
context:
space:
mode:
author: Jakub Jelinek <jakub@redhat.com>, 2021-01-13 10:15:13 +0100
committer: Jakub Jelinek <jakub@redhat.com>, 2021-01-13 10:15:13 +0100
commit: 5d057bfeff70e5b8d00e521844c476f62d51e22c (patch)
tree: 4a2b273b233f5d809538bba3e983fe9a226ced9c /gcc
parent: 6b70fa678b1a5ecf3ee87e2be87c3dc9dd08cc92 (diff)
download: gcc-5d057bfeff70e5b8d00e521844c476f62d51e22c.zip
gcc-5d057bfeff70e5b8d00e521844c476f62d51e22c.tar.gz
gcc-5d057bfeff70e5b8d00e521844c476f62d51e22c.tar.bz2
i386: Add define_insn_and_split patterns for btrl [PR96938]
In the following testcase we only optimize f2 and f7 to btrl, although we should optimize all of the functions that way. The problem is the type demotion/narrowing (which is performed solely during generic folding and not later): without it, the AND is performed in SImode and we match it as btrl, but with it, while the shifts are still performed in SImode, the AND is already done in QImode or HImode on the low part of the shift result.

2021-01-13 Jakub Jelinek <jakub@redhat.com>

PR target/96938
* config/i386/i386.md (*btr<mode>_1, *btr<mode>_2): New define_insn_and_split patterns.
(splitter after *btr<mode>_2): New splitter.
* gcc.target/i386/pr96938.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/config/i386/i386.md 65
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr96938.c 66
2 files changed, 131 insertions, 0 deletions
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c102312..b60784a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12419,6 +12419,71 @@
(match_dup 3)))
(clobber (reg:CC FLAGS_REG))])])
+(define_insn_and_split "*btr<mode>_1"  ;; Narrow-mode (SWI12) AND with a one-bit-clear mask, rewritten by the split below as an SImode AND.
+ [(set (match_operand:SWI12 0 "register_operand")
+ (and:SWI12
+ (subreg:SWI12  ;; Only the low part (offset 0) of the SImode mask takes part in the narrow AND.
+ (rotate:SI (const_int -2)  ;; -2 has every bit set except bit 0; rotating it left by operand 2 moves the clear bit to that position.
+ (match_operand:QI 2 "register_operand")) 0)
+ (match_operand:SWI12 1 "nonimmediate_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_USE_BT && ix86_pre_reload_split ()"  ;; NOTE(review): both are defined elsewhere; the second presumably limits matching to before register allocation -- confirm.
+ "#"  ;; No assembler template of its own: the insn has to be split.
+ "&& 1"  ;; Split condition adds nothing to the insn condition, so every match is split.
+ [(parallel
+ [(set (match_dup 0)
+ (and:SI (rotate:SI (const_int -2) (match_dup 2))  ;; Same mask, now ANDed in SImode; per the commit message this SImode form is what gets matched as btrl.
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);  /* View the narrow destination register as SImode.  */
+ if (MEM_P (operands[1]))
+ operands[1] = force_reg (<MODE>mode, operands[1]);  /* Load a memory source into a register in its own mode first; presumably so the SImode view below never widens the memory access -- confirm.  */
+ operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);  /* View the (now register) source as SImode.  */
+})
+
+(define_insn_and_split "*btr<mode>_2"  ;; Store of 0 into a one-bit field of a narrow (SWI12) operand, i.e. clearing a single variable bit.
+ [(set (zero_extract:HI
+ (match_operand:SWI12 0 "nonimmediate_operand")  ;; Destination may be a register or memory; only the memory case is split by this pattern.
+ (const_int 1)  ;; Field width: one bit.
+ (zero_extend:SI (match_operand:QI 1 "register_operand")))  ;; Field position: the zero-extended QImode bit number.
+ (const_int 0))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_USE_BT && ix86_pre_reload_split ()"  ;; NOTE(review): both are defined elsewhere; the second presumably limits matching to before register allocation -- confirm.
+ "#"  ;; No assembler template of its own: the insn has to be split.
+ "&& MEM_P (operands[0])"  ;; This split handles memory destinations; the register-destination form is covered by the define_split that follows.
+ [(set (match_dup 2) (match_dup 0))  ;; 1. Load the memory operand into a fresh register.
+ (parallel
+ [(set (match_dup 3)
+ (and:SI (rotate:SI (const_int -2) (match_dup 1))  ;; 2. Clear the bit with an SImode AND against the rotated -2 mask.
+ (match_dup 4)))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_dup 0) (match_dup 5))]  ;; 3. Store the narrow result back to memory.
+{
+ operands[2] = gen_reg_rtx (<MODE>mode);  /* Fresh register receiving the loaded value.  */
+ operands[5] = gen_reg_rtx (<MODE>mode);  /* Fresh register holding the result to store back.  */
+ operands[3] = lowpart_subreg (SImode, operands[5], <MODE>mode);  /* SImode view of the result register: destination of the AND.  */
+ operands[4] = lowpart_subreg (SImode, operands[2], <MODE>mode);  /* SImode view of the loaded value: source of the AND.  */
+})
+
+(define_split  ;; Register-destination counterpart of the one-bit clear above: rewrite it directly as an SImode AND.
+ [(set (zero_extract:HI
+ (match_operand:SWI12 0 "register_operand")  ;; Destination is restricted to a register here.
+ (const_int 1)  ;; Field width: one bit.
+ (zero_extend:SI (match_operand:QI 1 "register_operand")))  ;; Field position: the zero-extended QImode bit number.
+ (const_int 0))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_USE_BT && ix86_pre_reload_split ()"  ;; NOTE(review): both are defined elsewhere; the second presumably limits matching to before register allocation -- confirm.
+ [(parallel
+ [(set (match_dup 0)
+ (and:SI (rotate:SI (const_int -2) (match_dup 1))  ;; Mask with only the selected bit clear, ANDed with the register's own value.
+ (match_dup 2)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[2] = lowpart_subreg (SImode, operands[0], <MODE>mode);  /* Source view; must be taken before operands[0] is overwritten on the next line.  */
+ operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);  /* Destination view of the same register, so the AND updates it in place.  */
+})
+
;; These instructions are never faster than the corresponding
;; and/ior/xor operations when using immediate operand, so with
;; 32-bit there's no point. But in 64-bit, we can't hold the
diff --git a/gcc/testsuite/gcc.target/i386/pr96938.c b/gcc/testsuite/gcc.target/i386/pr96938.c
new file mode 100644
index 0000000..832cdd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96938.c
@@ -0,0 +1,66 @@
+/* PR target/96938 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "\tbtrl\t" 10 } } */
+
+void
+f1 (unsigned char *f, int o, unsigned char v)  /* Byte destination: clear bit o of *f and OR in v << o, all in one expression.  */
+{
+ *f = (*f & ~(1 << o)) | (v << o);
+}
+
+void
+f2 (unsigned char *f, int o, unsigned char v)  /* Like f1, but the masked value passes through an int temporary; the commit message lists this form as already optimized to btrl.  */
+{
+ int t = *f & ~(1 << o);
+ *f = t | (v << o);
+}
+
+void
+f3 (unsigned char *f, int o, unsigned char v)  /* Byte destination: only clears bit o of *f via a compound assignment; v is unused.  */
+{
+ *f &= ~(1 << o);
+}
+
+void
+f4 (unsigned char *f, int o, unsigned char v)  /* Byte destination: bit number is masked with 31 before the shift; v is ORed in unshifted.  */
+{
+ *f = (*f & ~(1 << (o & 31))) | v;
+}
+
+void
+f5 (unsigned char *f, int o, unsigned char v)  /* Byte destination: bit number is masked with 31 both for the cleared bit and for the shift of v.  */
+{
+ *f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
+}
+
+void
+f6 (unsigned short *f, int o, unsigned short v)  /* 16-bit counterpart of f1: clear bit o of *f and OR in v << o, all in one expression.  */
+{
+ *f = (*f & ~(1 << o)) | (v << o);
+}
+
+void
+f7 (unsigned short *f, int o, unsigned short v)  /* 16-bit counterpart of f2 (int temporary); the commit message lists this form as already optimized to btrl.  */
+{
+ int t = *f & ~(1 << o);
+ *f = t | (v << o);
+}
+
+void
+f8 (unsigned short *f, int o, unsigned short v)  /* 16-bit counterpart of f3: only clears bit o of *f; v is unused.  */
+{
+ *f &= ~(1 << o);
+}
+
+void
+f9 (unsigned short *f, int o, unsigned short v)  /* 16-bit counterpart of f4: bit number is masked with 31; v is ORed in unshifted.  */
+{
+ *f = (*f & ~(1 << (o & 31))) | v;
+}
+
+void
+f10 (unsigned short *f, int o, unsigned short v)  /* 16-bit counterpart of f5: bit number is masked with 31 for both the cleared bit and the shift of v.  */
+{
+ *f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
+}