aboutsummaryrefslogtreecommitdiff
path: root/gcc/config/i386/sse.md
diff options
context:
space:
mode:
authorUros Bizjak <ubizjak@gmail.com>2021-12-01 23:01:09 +0100
committerUros Bizjak <ubizjak@gmail.com>2021-12-01 23:03:42 +0100
commit7eb961d83b0eda53aeb1cfaacdc367e1952de613 (patch)
tree21f566577f052fb2500939c1f09a4b95087aa8cf /gcc/config/i386/sse.md
parentde3e5aae6c4b540e808c822c1e878b0a3304d09c (diff)
downloadgcc-7eb961d83b0eda53aeb1cfaacdc367e1952de613.zip
gcc-7eb961d83b0eda53aeb1cfaacdc367e1952de613.tar.gz
gcc-7eb961d83b0eda53aeb1cfaacdc367e1952de613.tar.bz2
i386: Improve V8HI and V8HF inserts [PR102811]
Introduce vec_set_0 pattern for V8HI and V8HF modes to implement scalar element 0 inserts from a GP register, SSE register or memory. Also add V8HI and V8HF AVX2 (x,x,x) alternative to PINSR insn pattern, which is split after reload to a sequence of PBROADCASTW and PBLENDW. The V8HF inserts from memory improve from: - vpbroadcastw 4(%esp), %xmm1 - vpblendw $16, %xmm1, %xmm0, %xmm0 + vpinsrw $4, 4(%esp), %xmm0, %xmm0 and V8HF inserts from SSE register to element 0 improve from: vpxor %xmm2, %xmm2, %xmm2 - vpbroadcastw %xmm0, %xmm0 vpblendw $1, %xmm0, %xmm2, %xmm0 Based on the above improvements, the register allocator is able to determine the optimal instruction (or instruction sequence) based on the register set of the input value, so there is no need to manually expand V8HI and V8HF inserts to the sequence of VEC_DUPLICATE and VEC_MERGE RTXes. 2021-12-01 Uroš Bizjak <ubizjak@gmail.com> gcc/ChangeLog: PR target/102811 * config/i386/sse.md (VI2F): Remove mode iterator. (VI2F_256_512): New mode iterator. (vec_set<V8_128:mode>_0): New insn pattern. (vec_set<VI2F_256_512:mode>_0): Rename from vec_set<VI2F:mode>_0. Use VI2F_256_512 mode iterator instead of VI2F. (*avx512fp16_movsh): Remove. (<sse2p4_1>_pinsr<ssemodesuffix>): Add (x,x,x) AVX2 alternative. Do not disable V8HF mode insn on AVX2 targets. (pinsrw -> pbroadcast + pblendw peephole2): New peephole. (pinsrw -> pbroadcast + pblendw splitter): New post-reload splitter. * config/i386/i386.md (extendhfsf): Call gen_vec_setv8hf_0. * config/i386/i386-expand.c (ix86_expand_vector_set) <case E_V8HFmode>: Use vec_merge path for TARGET_AVX2. gcc/testsuite/ChangeLog: PR target/102811 * gcc.target/i386/pr102811-1.c: New test. * gcc.target/i386/avx512fp16-1c.c (dg-final): Update scan-assembler-times scan strings for ia32 targets. * gcc.target/i386/pr102327-1.c (dg-final): Ditto. * gcc.target/i386/pr102811.c: Rename from ... * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: ... this.
Diffstat (limited to 'gcc/config/i386/sse.md')
-rw-r--r--gcc/config/i386/sse.md170
1 files changed, 139 insertions, 31 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 08bdcdd..f8b34a1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -827,7 +827,7 @@
(V32HF "TARGET_AVX512BW")])
;; Int-float size matches
-(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF])
+(define_mode_iterator VI2F_256_512 [V16HI V32HI V16HF V32HF])
(define_mode_iterator VI4F_128 [V4SI V4SF])
(define_mode_iterator VI8F_128 [V2DI V2DF])
(define_mode_iterator VI4F_256 [V8SI V8SF])
@@ -10170,13 +10170,84 @@
]
(symbol_ref "true")))])
+(define_insn "vec_set<mode>_0"
+ [(set (match_operand:V8_128 0 "register_operand"
+ "=v,v,v,x,x,Yr,*x,x,x,x,v,v")
+ (vec_merge:V8_128
+ (vec_duplicate:V8_128
+ (match_operand:<ssescalarmode> 2 "nonimmediate_operand"
+ " r,m,v,r,m,Yr,*x,r,m,x,r,m"))
+ (match_operand:V8_128 1 "reg_or_0_operand"
+ " C,C,v,0,0,0 ,0 ,x,x,x,v,v")
+ (const_int 1)))]
+ "TARGET_SSE2"
+ "@
+ vmovw\t{%k2, %0|%0, %k2}
+ vmovw\t{%2, %0|%0, %2}
+ vmovsh\t{%2, %1, %0|%0, %1, %2}
+ pinsrw\t{$0, %k2, %0|%0, %k2, 0}
+ pinsrw\t{$0, %2, %0|%0, %2, 0}
+ pblendw\t{$1, %2, %0|%0, %2, 1}
+ pblendw\t{$1, %2, %0|%0, %2, 1}
+ vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0}
+ vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0}
+ vpblendw\t{$1, %2, %1, %0|%0, %1, %2, 1}
+ vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0}
+ vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0}"
+ [(set (attr "isa")
+ (cond [(eq_attr "alternative" "0,1,2")
+ (const_string "avx512fp16")
+ (eq_attr "alternative" "3")
+ (const_string "noavx")
+ (eq_attr "alternative" "4,5,6")
+ (const_string "sse4_noavx")
+ (eq_attr "alternative" "7,8,9")
+ (const_string "avx")
+ (eq_attr "alternative" "10,11")
+ (const_string "avx512bw")
+ ]
+ (const_string "*")))
+ (set (attr "type")
+ (if_then_else (eq_attr "alternative" "0,1,2,5,6,9")
+ (const_string "ssemov")
+ (const_string "sselog")))
+ (set (attr "prefix_data16")
+ (if_then_else (eq_attr "alternative" "3,4")
+ (const_string "1")
+ (const_string "*")))
+ (set (attr "prefix_extra")
+ (if_then_else (eq_attr "alternative" "5,6,7,8,9")
+ (const_string "1")
+ (const_string "*")))
+ (set (attr "length_immediate")
+ (if_then_else (eq_attr "alternative" "0,1,2")
+ (const_string "*")
+ (const_string "1")))
+ (set (attr "prefix")
+ (cond [(eq_attr "alternative" "0,1,2,10,11")
+ (const_string "evex")
+ (eq_attr "alternative" "7,8,9")
+ (const_string "vex")
+ ]
+ (const_string "orig")))
+ (set (attr "mode")
+ (if_then_else (eq_attr "alternative" "0,1,2")
+ (const_string "HF")
+ (const_string "TI")))
+ (set (attr "enabled")
+ (cond [(and (not (match_test "<MODE>mode == V8HFmode"))
+ (eq_attr "alternative" "2"))
+ (symbol_ref "false")
+ ]
+ (const_string "*")))])
+
;; vmovw also clears the higher bits
(define_insn "vec_set<mode>_0"
- [(set (match_operand:VI2F 0 "register_operand" "=v,v")
- (vec_merge:VI2F
- (vec_duplicate:VI2F
+ [(set (match_operand:VI2F_256_512 0 "register_operand" "=v,v")
+ (vec_merge:VI2F_256_512
+ (vec_duplicate:VI2F_256_512
(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m"))
- (match_operand:VI2F 1 "const0_operand" "C,C")
+ (match_operand:VI2F_256_512 1 "const0_operand" "C,C")
(const_int 1)))]
"TARGET_AVX512FP16"
"@
@@ -10186,19 +10257,6 @@
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
-(define_insn "*avx512fp16_movsh"
- [(set (match_operand:V8HF 0 "register_operand" "=v")
- (vec_merge:V8HF
- (vec_duplicate:V8HF
- (match_operand:HF 2 "register_operand" "v"))
- (match_operand:V8HF 1 "register_operand" "v")
- (const_int 1)))]
- "TARGET_AVX512FP16"
- "vmovsh\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix" "evex")
- (set_attr "mode" "HF")])
-
(define_insn "avx512fp16_movsh"
[(set (match_operand:V8HF 0 "register_operand" "=v")
(vec_merge:V8HF
@@ -17312,20 +17370,20 @@
(V4SI "avx512dq") (V2DI "avx512dq")])
;; sse4_1_pinsrd must come before sse2_loadld since it is preferred.
-;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better.
(define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
- [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v")
+ [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v,x")
(vec_merge:PINSR_MODE
(vec_duplicate:PINSR_MODE
- (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m,r,m,r,m"))
- (match_operand:PINSR_MODE 1 "register_operand" "0,0,x,x,v,v")
+ (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m,r,m,r,m,x"))
+ (match_operand:PINSR_MODE 1 "register_operand" "0,0,x,x,v,v,x")
(match_operand:SI 3 "const_int_operand")))]
"TARGET_SSE2
&& ((unsigned) exact_log2 (INTVAL (operands[3]))
- < GET_MODE_NUNITS (<MODE>mode))
- && !(<MODE>mode == V8HFmode && TARGET_AVX2)"
+ < GET_MODE_NUNITS (<MODE>mode))"
{
- operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+ HOST_WIDE_INT items = INTVAL (operands[3]);
+
+ operands[3] = GEN_INT (exact_log2 (items));
switch (which_alternative)
{
@@ -17343,33 +17401,83 @@
case 3:
case 5:
return "vpinsr<sseintmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ case 6:
+ /* This pattern needs to be shadowed with vec_set{v8hi,v8hf}_0. */
+ gcc_assert (items > 1);
+ return "#";
default:
gcc_unreachable ();
}
}
- [(set_attr "isa" "noavx,noavx,avx,avx,<pinsr_evex_isa>,<pinsr_evex_isa>")
+ [(set_attr "isa" "noavx,noavx,avx,avx,<pinsr_evex_isa>,<pinsr_evex_isa>,avx2")
(set_attr "type" "sselog")
(set (attr "prefix_rex")
(if_then_else
(and (not (match_test "TARGET_AVX"))
- (eq (const_string "<MODE>mode") (const_string "V2DImode")))
+ (match_test "GET_MODE_NUNITS (<MODE>mode) == 2"))
(const_string "1")
(const_string "*")))
(set (attr "prefix_data16")
(if_then_else
(and (not (match_test "TARGET_AVX"))
- (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+ (match_test "GET_MODE_NUNITS (<MODE>mode) == 8"))
(const_string "1")
(const_string "*")))
(set (attr "prefix_extra")
(if_then_else
(and (not (match_test "TARGET_AVX"))
- (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+ (match_test "GET_MODE_NUNITS (<MODE>mode) == 8"))
(const_string "*")
(const_string "1")))
(set_attr "length_immediate" "1")
- (set_attr "prefix" "orig,orig,vex,vex,evex,evex")
- (set_attr "mode" "TI")])
+ (set_attr "prefix" "orig,orig,vex,vex,evex,evex,vex")
+ (set_attr "mode" "TI")
+ (set (attr "enabled")
+ (cond [(and (not (match_test "GET_MODE_NUNITS (<MODE>mode) == 8"))
+ (eq_attr "alternative" "6"))
+ (symbol_ref "false")
+ ]
+ (const_string "*")))])
+
+;; For TARGET_AVX2, implement insert from XMM reg with PBROADCASTW + PBLENDW.
+;; First try to get a scratch register and go through it. In case this fails,
+;; overwrite source reg with broadcasted value and blend from there.
+(define_peephole2
+ [(match_scratch:V8_128 4 "x")
+ (set (match_operand:V8_128 0 "sse_reg_operand")
+ (vec_merge:V8_128
+ (vec_duplicate:V8_128
+ (match_operand:<ssescalarmode> 2 "sse_reg_operand"))
+ (match_operand:V8_128 1 "sse_reg_operand")
+ (match_operand:SI 3 "const_int_operand")))]
+ "TARGET_AVX2
+ && INTVAL (operands[3]) > 1
+ && ((unsigned) exact_log2 (INTVAL (operands[3]))
+ < GET_MODE_NUNITS (<MODE>mode))"
+ [(set (match_dup 4)
+ (vec_duplicate:V8_128 (match_dup 2)))
+ (set (match_dup 0)
+ (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))])
+
+(define_split
+ [(set (match_operand:V8_128 0 "sse_reg_operand")
+ (vec_merge:V8_128
+ (vec_duplicate:V8_128
+ (match_operand:<ssescalarmode> 2 "sse_reg_operand"))
+ (match_operand:V8_128 1 "sse_reg_operand")
+ (match_operand:SI 3 "const_int_operand")))]
+ "TARGET_AVX2 && epilogue_completed
+ && INTVAL (operands[3]) > 1
+ && ((unsigned) exact_log2 (INTVAL (operands[3]))
+ < GET_MODE_NUNITS (<MODE>mode))"
+ [(set (match_dup 4)
+ (vec_duplicate:V8_128 (match_dup 2)))
+ (set (match_dup 0)
+ (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))]
+{
+ operands[4] = lowpart_subreg (<MODE>mode, operands[2],
+ <ssescalarmode>mode);
+})
(define_expand "<extract_type>_vinsert<shuffletype><extract_suf>_mask"
[(match_operand:AVX512_VEC 0 "register_operand")