aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoger Sayle <roger@nextmovesoftware.com>2023-08-04 16:23:38 +0100
committerRoger Sayle <roger@nextmovesoftware.com>2023-08-04 16:23:38 +0100
commitfaa2202ee7fcf039b2016ce5766a2927526c5f78 (patch)
treefe6ea681034f101e0a9e6a924a16fb5cbf525585
parent44e3f39a3d5f08ac4129c0558a90e297d2dd1e30 (diff)
downloadgcc-faa2202ee7fcf039b2016ce5766a2927526c5f78.zip
gcc-faa2202ee7fcf039b2016ce5766a2927526c5f78.tar.gz
gcc-faa2202ee7fcf039b2016ce5766a2927526c5f78.tar.bz2
i386: Split SUBREGs of SSE vector registers into vec_select insns.
This patch is the final piece in the series to improve the ABI issues
affecting PR 88873.  The previous patches tackled inserting DFmode
values into V2DFmode registers, by introducing insvti_{low,high}part
patterns.  This patch improves the extraction of DFmode values from
V2DFmode registers via TImode intermediates.

I'd initially thought this would require new extvti_{low,high}part
patterns to be defined, but all that's required is to recognize that
the SUBREG idioms produced by combine are equivalent to (forms of)
vec_select patterns.  The target-independent middle-end can't be sure
that the appropriate vec_select instruction exists on the target,
hence doesn't canonicalize a SUBREG of a vector mode as a vec_select,
but the backend can provide a define_split stating where and when
this is useful, for example, considering whether the operand is in
memory, or whether !TARGET_SSE_MATH and the destination is i387.

For pr88873.c, gcc -O2 -march=cascadelake currently generates:

foo:    vpunpcklqdq     %xmm3, %xmm2, %xmm7
        vpunpcklqdq     %xmm1, %xmm0, %xmm6
        vpunpcklqdq     %xmm5, %xmm4, %xmm2
        vmovdqa         %xmm7, -24(%rsp)
        vmovdqa         %xmm6, %xmm1
        movq            -16(%rsp), %rax
        vpinsrq         $1, %rax, %xmm7, %xmm4
        vmovapd         %xmm4, %xmm6
        vfmadd132pd     %xmm1, %xmm2, %xmm6
        vmovapd         %xmm6, -24(%rsp)
        vmovsd          -16(%rsp), %xmm1
        vmovsd          -24(%rsp), %xmm0
        ret

with this patch, we now generate:

foo:    vpunpcklqdq     %xmm1, %xmm0, %xmm6
        vpunpcklqdq     %xmm3, %xmm2, %xmm7
        vpunpcklqdq     %xmm5, %xmm4, %xmm2
        vmovdqa         %xmm6, %xmm1
        vfmadd132pd     %xmm7, %xmm2, %xmm1
        vmovsd          %xmm1, %xmm1, %xmm0
        vunpckhpd       %xmm1, %xmm1, %xmm1
        ret

The improvement is even more dramatic when compared to the original
29 instructions shown in comment #8.  GCC 13, for example, required
12 transfers to/from memory.

2023-08-04  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/sse.md (define_split): Convert highpart:DF extract
        from V2DFmode register into a sse2_storehpd instruction.
        (define_split): Likewise, convert lowpart:DF extract from V2DF
        register into a sse2_storelpd instruction.

gcc/testsuite/ChangeLog
        * gcc.target/i386/pr88873.c: Tweak to check for improved code.
-rw-r--r-- gcc/config/i386/sse.md 16
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr88873.c 2
2 files changed, 18 insertions, 0 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ab455c3..f1712b0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13554,6 +13554,14 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
+;; Convert highpart DF SUBREG into sse2_storehpd or *vec_extractv2df_1_sse.
+(define_split
+ [(set (match_operand:DF 0 "register_operand")
+ (subreg:DF (match_operand:V2DF 1 "register_operand") 8))]
+ "TARGET_SSE"
+ [(set (match_dup 0)
+ (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))])
+
;; Avoid combining registers from different units in a single alternative,
;; see comment above inline_secondary_memory_needed function in i386.cc
(define_insn "sse2_storelpd"
@@ -13599,6 +13607,14 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
+;; Convert lowpart DF SUBREG into sse2_storelpd or *vec_extractv2df_0_sse.
+(define_split
+ [(set (match_operand:DF 0 "register_operand")
+ (subreg:DF (match_operand:V2DF 1 "register_operand") 0))]
+ "TARGET_SSE"
+ [(set (match_dup 0)
+ (vec_select:DF (match_dup 1) (parallel [(const_int 0)])))])
+
(define_expand "sse2_loadhpd_exp"
[(set (match_operand:V2DF 0 "nonimmediate_operand")
(vec_concat:V2DF
diff --git a/gcc/testsuite/gcc.target/i386/pr88873.c b/gcc/testsuite/gcc.target/i386/pr88873.c
index d893aac..a3a7ef2 100644
--- a/gcc/testsuite/gcc.target/i386/pr88873.c
+++ b/gcc/testsuite/gcc.target/i386/pr88873.c
@@ -9,3 +9,5 @@ s_t foo (s_t a, s_t b, s_t c)
}
/* { dg-final { scan-assembler-times "vpunpcklqdq" 3 } } */
+/* { dg-final { scan-assembler "vunpckhpd" } } */
+/* { dg-final { scan-assembler-not "rsp" } } */