aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorSrinath Parvathaneni <srinath.parvathaneni@arm.com>2020-06-04 15:41:29 +0100
committerSrinath Parvathaneni <srinath.parvathaneni@arm.com>2020-06-04 15:55:31 +0100
commit9a810e57c4e6af54d29c325a013f451ade2b85e8 (patch)
tree0e8b8a988b085a69d81f97f8137125af5ef904b5 /gcc
parentd34f510e2bf976cff3b9fbf7a8c5a41c233db2e4 (diff)
downloadgcc-9a810e57c4e6af54d29c325a013f451ade2b85e8.zip
gcc-9a810e57c4e6af54d29c325a013f451ade2b85e8.tar.gz
gcc-9a810e57c4e6af54d29c325a013f451ade2b85e8.tar.bz2
[ARM]: Correct the grouping of operands in MVE vector scatter store intrinsics (PR94735).
The operands in RTL patterns of MVE vector scatter store intrinsics are wrongly grouped, because of which a few vector load and store instructions are wrongly optimized out with -O2. A new predicate "mve_scatter_memory" is defined in this patch; this predicate returns TRUE on matching (mem (reg)) for MVE scatter store intrinsics. This patch fixes the issue by adding a define_expand pattern with the "mve_scatter_memory" predicate and calling the corresponding define_insn by passing register_operand as the first argument. This register_operand is extracted from the operand with the "mve_scatter_memory" predicate in the define_expand pattern. gcc/ChangeLog: 2020-06-01 Srinath Parvathaneni <srinath.parvathaneni@arm.com> PR target/94735 * config/arm/predicates.md (mve_scatter_memory): Define to match (mem (reg)) for scatter store memory. * config/arm/mve.md (mve_vstrbq_scatter_offset_<supf><mode>): Modify define_insn to define_expand. (mve_vstrbq_scatter_offset_p_<supf><mode>): Likewise. (mve_vstrhq_scatter_offset_<supf><mode>): Likewise. (mve_vstrhq_scatter_shifted_offset_p_<supf><mode>): Likewise. (mve_vstrhq_scatter_shifted_offset_<supf><mode>): Likewise. (mve_vstrdq_scatter_offset_p_<supf>v2di): Likewise. (mve_vstrdq_scatter_offset_<supf>v2di): Likewise. (mve_vstrdq_scatter_shifted_offset_p_<supf>v2di): Likewise. (mve_vstrdq_scatter_shifted_offset_<supf>v2di): Likewise. (mve_vstrhq_scatter_offset_fv8hf): Likewise. (mve_vstrhq_scatter_offset_p_fv8hf): Likewise. (mve_vstrhq_scatter_shifted_offset_fv8hf): Likewise. (mve_vstrhq_scatter_shifted_offset_p_fv8hf): Likewise. (mve_vstrwq_scatter_offset_fv4sf): Likewise. (mve_vstrwq_scatter_offset_p_fv4sf): Likewise. (mve_vstrwq_scatter_offset_p_<supf>v4si): Likewise. (mve_vstrwq_scatter_offset_<supf>v4si): Likewise. (mve_vstrwq_scatter_shifted_offset_fv4sf): Likewise. (mve_vstrwq_scatter_shifted_offset_p_fv4sf): Likewise. (mve_vstrwq_scatter_shifted_offset_p_<supf>v4si): Likewise. (mve_vstrwq_scatter_shifted_offset_<supf>v4si): Likewise. 
(mve_vstrbq_scatter_offset_<supf><mode>_insn): Define insn for scatter stores. (mve_vstrbq_scatter_offset_p_<supf><mode>_insn): Likewise. (mve_vstrhq_scatter_offset_<supf><mode>_insn): Likewise. (mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn): Likewise. (mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn): Likewise. (mve_vstrdq_scatter_offset_p_<supf>v2di_insn): Likewise. (mve_vstrdq_scatter_offset_<supf>v2di_insn): Likewise. (mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn): Likewise. (mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn): Likewise. (mve_vstrhq_scatter_offset_fv8hf_insn): Likewise. (mve_vstrhq_scatter_offset_p_fv8hf_insn): Likewise. (mve_vstrhq_scatter_shifted_offset_fv8hf_insn): Likewise. (mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn): Likewise. (mve_vstrwq_scatter_offset_fv4sf_insn): Likewise. (mve_vstrwq_scatter_offset_p_fv4sf_insn): Likewise. (mve_vstrwq_scatter_offset_p_<supf>v4si_insn): Likewise. (mve_vstrwq_scatter_offset_<supf>v4si_insn): Likewise. (mve_vstrwq_scatter_shifted_offset_fv4sf_insn): Likewise. (mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn): Likewise. (mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn): Likewise. (mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn): Likewise. gcc/testsuite/ChangeLog: 2020-06-01 Srinath Parvathaneni <srinath.parvathaneni@arm.com> PR target/94735 * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c: New test. * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/arm/mve.md828
-rw-r--r--gcc/config/arm/predicates.md6
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c67
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c69
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c215
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c216
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c141
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c142
8 files changed, 1363 insertions, 321 deletions
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 986fbfe..3a57901 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -8102,22 +8102,29 @@
;;
;; [vstrbq_scatter_offset_s vstrbq_scatter_offset_u]
;;
-(define_insn "mve_vstrbq_scatter_offset_<supf><mode>"
- [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_B_ELEM>
- [(match_operand:MVE_2 1 "s_register_operand" "w")
- (match_operand:MVE_2 2 "s_register_operand" "w")]
- VSTRBSOQ))
- ]
+(define_expand "mve_vstrbq_scatter_offset_<supf><mode>"
+ [(match_operand:<MVE_B_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn("vstrb.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrbq_scatter_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrbq_scatter_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_2 1 "s_register_operand" "w")
+ (match_operand:MVE_2 2 "s_register_operand" "w")]
+ VSTRBSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrb.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
@@ -8210,23 +8217,33 @@
;;
;; [vstrbq_scatter_offset_p_s vstrbq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_B_ELEM>
- [(match_operand:MVE_2 1 "s_register_operand" "w")
- (match_operand:MVE_2 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRBSOQ))
- ]
+(define_expand "mve_vstrbq_scatter_offset_p_<supf><mode>"
+ [(match_operand:<MVE_B_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrbt.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrbq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_2 1 "s_register_operand" "w")
+ (match_operand:MVE_2 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRBSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrbt.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
@@ -9097,87 +9114,122 @@
;;
;; [vstrhq_scatter_offset_p_s vstrhq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_offset_p_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_offset_s vstrhq_scatter_offset_u]
;;
-(define_insn "mve_vstrhq_scatter_offset_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")]
- VSTRHSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_offset_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")]
+ VSTRHSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrh.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_shifted_offset_p_s vstrhq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Ux")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHSSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_shifted_offset_s vstrhq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")]
- VSTRHSSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")]
+ VSTRHSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrh.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "4")])
;;
@@ -9345,173 +9397,240 @@
;;
;; [vstrdq_scatter_offset_p_s vstrdq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRDSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_offset_p_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrdq_scatter_offset_p_<supf>v2di_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRDSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrdt.64\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrdq_scatter_offset_s vstrdq_scatter_offset_u]
;;
-(define_insn "mve_vstrdq_scatter_offset_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")]
- VSTRDSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_offset_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrd.64\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrdq_scatter_offset_<supf>v2di_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrdq_scatter_offset_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")]
+ VSTRDSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrd.64\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrdq_scatter_shifted_offset_p_s vstrdq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRDSSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1, UXTW #3]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRDSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrdt.64\t%q2, [%0, %q1, UXTW #3]"
[(set_attr "length" "8")])
;;
;; [vstrdq_scatter_shifted_offset_s vstrdq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")]
- VSTRDSSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrd.64\t%q2, [%m0, %q1, UXTW #3]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")]
+ VSTRDSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrd.64\t%q2, [%0, %q1, UXTW #3]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_offset_f]
;;
-(define_insn "mve_vstrhq_scatter_offset_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")]
- VSTRHQSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_offset_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.16\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_fv8hf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_offset_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")]
+ VSTRHQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrh.16\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_offset_p_f]
;;
-(define_insn "mve_vstrhq_scatter_offset_p_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHQSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_offset_p_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_p_fv8hf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_offset_p_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrht.16\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_shifted_offset_f]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")]
- VSTRHQSSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.16\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_shifted_offset_fv8hf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")]
+ VSTRHQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrh.16\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_shifted_offset_p_f]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHQSSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrht.16\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "8")])
;;
@@ -9562,173 +9681,240 @@
;;
;; [vstrwq_scatter_offset_f]
;;
-(define_insn "mve_vstrwq_scatter_offset_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")]
- VSTRWQSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_offset_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_fv4sf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_offset_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")]
+ VSTRWQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrw.32\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrwq_scatter_offset_p_f]
;;
-(define_insn "mve_vstrwq_scatter_offset_p_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWQSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_offset_p_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_p_fv4sf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_offset_p_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
-;; [vstrwq_scatter_offset_p_s vstrwq_scatter_offset_p_u]
+;; [vstrwq_scatter_offset_p_s vstrwq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_offset_p_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_p_<supf>v4si_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_offset_s vstrwq_scatter_offset_u]
;;
-(define_insn "mve_vstrwq_scatter_offset_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")]
- VSTRWSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_offset_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_<supf>v4si_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_offset_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")]
+ VSTRWSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrw.32\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrwq_scatter_shifted_offset_f]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")]
- VSTRWQSSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
- [(set_attr "length" "4")])
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_shifted_offset_fv4sf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")]
+ VSTRWQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
+ [(set_attr "length" "4")])
;;
;; [vstrwq_scatter_shifted_offset_p_f]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWQSSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_shifted_offset_p_s vstrwq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWSSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_shifted_offset_s vstrwq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")]
- VSTRWSSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")]
+ VSTRWSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "4")])
;;
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index c57ad73..9e9bca4 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -37,6 +37,12 @@
&& mve_vector_mem_operand (GET_MODE (op), XEXP (op, 0),
false)")))
+(define_predicate "mve_scatter_memory"
+ (and (match_code "mem")
+ (match_test "TARGET_HAVE_MVE && REG_P (XEXP (op, 0))
+ && mve_vector_mem_operand (GET_MODE (op), XEXP (op, 0),
+ false)")))
+
;; True for immediates in the range of 1 to 16 for MVE.
(define_predicate "mve_imm_16"
(match_test "satisfies_constraint_Rd (op)"))
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c
new file mode 100644
index 0000000..21b9e12d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_s32 (pDataDest, 4, value);
+ vstrwq_scatter_base_s32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_u32 (pDataDest, 4, value);
+ vstrwq_scatter_base_u32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_f32 (pDataDest, 4, value);
+ vstrwq_scatter_base_f32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_s64 (pDataDest, 256, value);
+ vstrdq_scatter_base_s64 (pDataDest, 512, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_u64 (pDataDest, 256, value);
+ vstrdq_scatter_base_u64 (pDataDest, 512, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c
new file mode 100644
index 0000000..15c6496
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c
@@ -0,0 +1,69 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+
+int
+foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_s32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_s32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_u32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_u32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_f32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_f32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_p_s64 (pDataDest, 256, value, __p);
+ vstrdq_scatter_base_p_s64 (pDataDest, 512, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_p_u64 (pDataDest, 256, value, __p);
+ vstrdq_scatter_base_p_u64 (pDataDest, 512, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c
new file mode 100644
index 0000000..6d12366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c
@@ -0,0 +1,215 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_u8 (pDataDest, vecOffs1, (uint8x16_t) vecIn1);
+ vstrbq_scatter_offset_u8 (pDataDest, vecOffs2, (uint8x16_t) vecIn2);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
+ vstrbq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrbq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foobs8( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1);
+ vstrbq_scatter_offset_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobs16( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
+ vstrbq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrbq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
+ vstrhq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrhq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
+ vstrhq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrhq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1);
+ vstrhq_scatter_offset_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrwq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrwq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrwq_scatter_offset_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1);
+ vstrwq_scatter_offset_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
+ vstrdq_scatter_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+int
+foows64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
+ vstrdq_scatter_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 32 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c
new file mode 100644
index 0000000..cd2e1ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c
@@ -0,0 +1,216 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+int
+foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_p_u8(pDataDest, vecOffs1, (uint8x16_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u8(pDataDest, vecOffs2, (uint8x16_t) vecIn2, __p);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foobs8( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2, __p);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobs16( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
+ vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+int
+foows64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p);
+ vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 32 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c
new file mode 100644
index 0000000..62dfb45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c
@@ -0,0 +1,141 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ float32x4_t vecIn1 = vldrwq_f32 ((float32_t const *) pDataSrc);
+ float32x4_t vecIn2 = vldrwq_f32 ((float32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ uint16x8_t vecIn1 = vldrhq_u16 ((uint16_t const *) pDataSrc);
+ uint16x8_t vecIn2 = vldrhq_u16 ((uint16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrhq_u32 ((uint16_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrhq_u32 ((uint16_t const *) &pDataSrc[4]);
+ vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ float16x8_t vecIn1 = vldrhq_f16 ((float16_t const *) pDataSrc);
+ float16x8_t vecIn2 = vldrhq_f16 ((float16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+
+ vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
+ vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ int16x8_t vecIn1 = vldrhq_s16 ((int16_t const *) pDataSrc);
+ int16x8_t vecIn2 = vldrhq_s16 ((int16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrhq_s32 ((int16_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrhq_s32 ((int16_t const *) &pDataSrc[4]);
+ vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foods64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[2]);
+
+ vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
+ vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c
new file mode 100644
index 0000000..a51d3a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c
@@ -0,0 +1,142 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+/* Predicated word scatter-store with shifted offsets (byte address = base +
+   offset << 2).  The _z loads zero inactive lanes; the _p stores write only
+   lanes active in __p.  Should emit two vstrwt instructions.  */
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs2, vecIn2, __p);
+ /* Scalar copy of the element just past the vector-processed region.  */
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* float32 variant of the predicated word scatter-store test above.  */
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ float32x4_t vecIn1 = vldrwq_z_f32 ((float32_t const *) pDataSrc, __p);
+ float32x4_t vecIn2 = vldrwq_z_f32 ((float32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Predicated halfword scatter-store with shifted offsets (byte address =
+   base + offset << 1); offsets cover indices 0..15 in permuted order.  */
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ uint16x8_t vecIn1 = vldrhq_z_u16 ((uint16_t const *) pDataSrc, __p);
+ uint16x8_t vecIn2 = vldrhq_z_u16 ((uint16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+/* Predicated widening halfword load into uint32 lanes, then a narrowing
+   halfword scatter-store.  Data flow is not round-trip-exact; only the
+   emitted instructions are checked by the dg-final scan.  */
+int
+foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrhq_z_u32 ((uint16_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrhq_z_u32 ((uint16_t const *) &pDataSrc[4], __p);
+ vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* float16 variant of the predicated halfword scatter-store test.  */
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ float16x8_t vecIn1 = vldrhq_z_f16 ((float16_t const *) pDataSrc, __p);
+ float16x8_t vecIn2 = vldrhq_z_f16 ((float16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+/* Predicated doubleword scatter-store with shifted offsets (byte address =
+   base + offset << 3).  The uint32x4_t loads are reinterpreted as
+   uint64x2_t via a GCC vector cast (bit reinterpretation, no conversion)
+   since MVE has no 64-bit contiguous load intrinsic.  */
+int
+foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[2], __p);
+
+ vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
+ vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+/* Signed-int32 variant of the predicated word scatter-store test.  */
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p);
+ int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Signed-int16 variant of the predicated halfword scatter-store test;
+   offsets cover indices 0..15 in permuted order.  */
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ int16x8_t vecIn1 = vldrhq_z_s16 ((int16_t const *) pDataSrc, __p);
+ int16x8_t vecIn2 = vldrhq_z_s16 ((int16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+/* Predicated widening halfword load into int32 lanes, then a narrowing
+   halfword scatter-store.  Only the emitted instructions matter to the
+   dg-final scan.  */
+int
+foohs32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrhq_z_s32 ((int16_t const *) pDataSrc, __p);
+ int32x4_t vecIn2 = vldrhq_z_s32 ((int16_t const *) &pDataSrc[4], __p);
+ vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Signed-int64 variant of the predicated doubleword scatter-store test;
+   the int32x4_t loads are bit-reinterpreted as int64x2_t via GCC vector
+   casts.  */
+int
+foods64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p);
+ int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[2], __p);
+
+ vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p);
+ vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */