diff options
author | liuhongt <hongtao.liu@intel.com> | 2023-06-29 14:25:28 +0800 |
---|---|---|
committer | liuhongt <hongtao.liu@intel.com> | 2023-07-12 15:51:17 +0800 |
commit | 13c556d6ae84be3ee2bc245a56eafa58221de86a (patch) | |
tree | 6dfae0fb54ba52e779c48eb7bee544f22c55fcf3 /gcc | |
parent | a0cb65d34cc141571e870fb3b53b3ff47ae3338d (diff) | |
download | gcc-13c556d6ae84be3ee2bc245a56eafa58221de86a.zip gcc-13c556d6ae84be3ee2bc245a56eafa58221de86a.tar.gz gcc-13c556d6ae84be3ee2bc245a56eafa58221de86a.tar.bz2 |
Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.
gcc/ChangeLog:
PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
define_insn.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
define_insn_and_split to avoid false dependence.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
(<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot<mode>3): Ditto.
(iornot<mode>3): Ditto.
(*<nlogic><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110438.c: New test.
* gcc.target/i386/pr100711-6.c: Adjust testcase.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/predicates.md | 8 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 145 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100711-6.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr110438.c | 30 |
4 files changed, 168 insertions, 17 deletions
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01..37d20c6 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a209937..24359cd 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || <MODE_SIZE> == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ operands[3] = CONST0_RTX (<MODE>mode); }") -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ "@ vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (<MODE>mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17164,15 +17211,83 @@ operands[2] = force_reg (<MODE>mode, operands[2]); }) -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode == SImode || <ssescalarmode>mode == DImode)" { + if (!<mask_applied> && which_alternative + && optimize_insn_for_speed_p ()) + return "#"; + + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; + else + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; +} + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied> + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 3)) + (parallel + [(set (match_dup 0) + (xor:VI (match_dup 1) (match_dup 2))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] +{ + if (MEM_P (operands[1])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + { + if (GET_MODE_SIZE (<ssescalarmode>mode) < 4) + { + if (<MODE_SIZE> == 64 ? TARGET_AVX512BW + : (TARGET_AVX512BW && TARGET_AVX512VL) + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (<MODE>mode); + } + else + { + if (<MODE_SIZE> == 64 || TARGET_AVX512VL + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (<MODE>mode); + } + } +} + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_int 1)))]) + +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC"))) + (unspec [(match_operand:VI 3 "register_operand" "0,0,0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F" +{ if (TARGET_AVX512VL) return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; else @@ -17224,7 +17339,7 @@ [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") (and:VI (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] "TARGET_SSE && (register_operand (operands[1], <MODE>mode) || register_operand (operands[2], <MODE>mode))" @@ -17683,8 +17798,8 @@ [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") (ior:VI (not:VI - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17708,7 +17823,7 @@ (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "2,3") + (if_then_else (eq_attr "alternative" "0,1") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17716,8 +17831,8 @@ [(set (match_operand:VI 0 "register_operand" "=v,v") (not:VI (xor:VI - (match_operand:VI 1 "bcst_vector_operand" "%v,v") - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17736,7 +17851,7 @@ (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17747,8 +17862,8 @@ (define_insn "*<nlogic><mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v") (andor:VI - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17767,7 +17882,7 @@ (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) diff --git a/gcc/testsuite/gcc.target/i386/pr100711-6.c b/gcc/testsuite/gcc.target/i386/pr100711-6.c index 7142a98..8085074 100644 --- a/gcc/testsuite/gcc.target/i386/pr100711-6.c +++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -Os" } */ typedef int v16si __attribute__ ((vector_size (64))); typedef long long v8di __attribute__((vector_size (64))); diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 0000000..11b8cc5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include <immintrin.h> + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) +{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} |