aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author    Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2021-01-20 18:11:20 +0000
committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2021-01-20 19:29:42 +0000
commite140f5fd3e235c5a37dc99b79f37a5ad4dc59064 (patch)
treeaf6c82b56028d4c97ed7e3d44f0a1cf997f41d59 /gcc
parentf8c677776617ab91826af1d950b00d853eaff622 (diff)
downloadgcc-e140f5fd3e235c5a37dc99b79f37a5ad4dc59064.zip
gcc-e140f5fd3e235c5a37dc99b79f37a5ad4dc59064.tar.gz
gcc-e140f5fd3e235c5a37dc99b79f37a5ad4dc59064.tar.bz2
aarch64: Split vec_selects of bottom elements into simple move
In certain intrinsics use cases GCC leaves SETs of a bottom-element vec
select lying around, for example:

    (vec_select:DI (reg:V2DI 34 v2 [orig:128 __o ] [128])
        (parallel [(const_int 0 [0])]))

This can be treated as a simple move in aarch64 when done between SIMD
registers for all normal widths. These go through the aarch64_get_lane
pattern. This patch adds a splitter there to simplify these extracts to
a move that can, perhaps, be optimised away. Another benefit is that if
the destination is memory we can use a simpler STR instruction rather
than ST1-lane.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_get_lane<mode>):
	Convert to define_insn_and_split.  Split into simple move when
	moving bottom element.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vdup_lane_2.c: Scan for fmov rather than dup.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md             | 10
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c |  2
2 files changed, 10 insertions, 2 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 41071b6..d7acd72 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3312,7 +3312,9 @@
;; Lane extraction of a value, neither sign nor zero extension
;; is guaranteed so upper bits should be considered undefined.
;; RTL uses GCC vector extension indices throughout so flip only for assembly.
-(define_insn "aarch64_get_lane<mode>"
+;; Extracting lane zero is split into a simple move when it is between SIMD
+;; registers or a store.
+(define_insn_and_split "aarch64_get_lane<mode>"
[(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
(vec_select:<VEL>
(match_operand:VALL_F16 1 "register_operand" "w, w, w")
@@ -3332,6 +3334,12 @@
gcc_unreachable ();
}
}
+ "&& reload_completed
+ && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
+ [(set (match_dup 0) (match_dup 1))]
+ {
+ operands[1] = aarch64_replace_reg_mode (operands[1], <VEL>mode);
+ }
[(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")]
)
diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
index a49db3e..16f4808 100644
--- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
@@ -333,7 +333,7 @@ main ()
/* Asm check for vdups_lane_f32, vdups_lane_s32, vdups_lane_u32. */
/* Can't generate "dup s<n>, v<m>[0]" for vdups_lane_s32 and vdups_lane_u32. */
-/* { dg-final { scan-assembler-times "dup\\ts\[0-9\]+, v\[0-9\]+\.s\\\[0\\\]" 1} } */
+/* { dg-final { scan-assembler-times {fmov\ts0, s1} 1 } } */
/* { dg-final { scan-assembler-times "dup\\ts\[0-9\]+, v\[0-9\]+\.s\\\[1\\\]" 3 } } */
/* Asm check for vdupd_lane_f64, vdupd_lane_s64, vdupd_lane_u64. */