diff options
author | David Green <david.green@arm.com> | 2022-02-06 16:17:06 +0000 |
---|---|---|
committer | David Green <david.green@arm.com> | 2022-02-06 16:17:06 +0000 |
commit | b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98 (patch) | |
tree | ea1b88c7628edd736b3e69160950eff790aacab5 | |
parent | 3dff4f5cfb461cd25e2c95f2a8e9511f266e5dee (diff) | |
download | llvm-b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98.zip llvm-b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98.tar.gz llvm-b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98.tar.bz2 |
[ARM] Mark i64 and f64 shuffles as Custom for MVE
This way they get lowered through ARMISD::BUILD_VECTOR, which can
produce more efficient D-register moves.
This also helps D115653 avoid getting stuck in a loop.
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-shuffle.ll | 187 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vst2-post.ll | 19 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vst2.ll | 53 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vst3.ll | 22 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vst4-post.ll | 30 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-vst4.ll | 24 |
8 files changed, 248 insertions, 92 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 32cb88b..ce62979 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -392,6 +392,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index e7e8382..92ed928 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -1474,6 +1474,189 @@ entry: ret <2 x double> %out } +define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) { +; CHECK-LABEL: shuffle4_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0> + ret <4 x double> %out +} +define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) { +; CHECK-LABEL: shuffle5_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x double> %out +} +define arm_aapcs_vfpcc <2 x double> 
@shuffle6_f64(<2 x double> %src1, <2 x double> %src2) { +; CHECK-LABEL: shuffle6_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3> + ret <2 x double> %out +} +define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) { +; CHECK-LABEL: shuffle7_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1> + ret <2 x double> %out +} +define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) { +; CHECK-LABEL: shuffle8_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1> + ret <2 x double> %out +} +define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { +; CHECK-LABEL: shuffle9_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s20, s2 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> + ret <8 x double> %out +} + + + + 
+define arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: shuffle4_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0> + ret <4 x i64> %out +} +define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: shuffle5_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i64> %out +} +define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: shuffle6_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %out +} +define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: shuffle7_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1> + ret <2 x i64> %out +} +define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: shuffle8_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: 
bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1> + ret <2 x i64> %out +} +define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { +; CHECK-LABEL: shuffle9_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s20, s2 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> + ret <8 x i64> %out +} + define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) { ; CHECK-LABEL: insert_i32: @@ -1548,7 +1731,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: adr r2, .LCPI76_0 +; CHECK-NEXT: adr r2, .LCPI88_0 ; CHECK-NEXT: vmov.u16 r0, q0[0] ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: mov r1, sp @@ -1558,7 +1741,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) { ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI76_0: +; CHECK-NEXT: .LCPI88_0: ; CHECK-NEXT: .zero 4 ; CHECK-NEXT: .long 7 @ 0x7 ; CHECK-NEXT: .long 1 @ 0x1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll index b8ddde7..690c017 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -149,8 +149,8 @@ define arm_aapcs_vfpcc void 
@vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i ; CHECK-LABEL: vmovn64_b2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr @@ -164,8 +164,8 @@ define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i ; CHECK-LABEL: vmovn64_b3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll index eafbf41..d482fee 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll @@ -72,17 +72,14 @@ entry: define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vst2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r1, #32 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vstrb.8 q0, [r1], #16 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f64 d5, d0 +; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1], #32 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll index c749b36..bcddeae 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -327,14 +327,11 @@ define void 
@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov.f64 d4, d3 +; CHECK-NEXT: vmov.f64 d5, d1 +; CHECK-NEXT: vmov.f64 d3, d0 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 @@ -349,29 +346,23 @@ entry: define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) { ; CHECK-LABEL: vst2_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s7, s9 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vstrb.8 q1, [r1], #48 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vstrw.32 q5, [r1, #-32] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vstrw.32 q0, [r1, 
#16] +; CHECK-NEXT: vmov.f64 d4, d6 +; CHECK-NEXT: vmov.f64 d2, d7 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 7d4763f..d3e042a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -916,20 +916,16 @@ entry: define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { ; CHECK-LABEL: vst3_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s9 -; CHECK-NEXT: vstrb.8 q1, [r1], #32 -; CHECK-NEXT: vmov.f32 s12, s10 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vstrw.32 q0, [r1, #-16] -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f64 d7, d1 +; CHECK-NEXT: vmov.f64 d1, d4 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d2, d5 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll index ee1fe9e..5fe7f2f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll @@ -105,25 +105,19 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: 
vmov.f32 s8, s16 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d3, d0 +; CHECK-NEXT: vmov.f64 d0, d7 +; CHECK-NEXT: vmov.f64 d7, d4 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s17, s13 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f32 s5, s15 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vstrw.32 q2, [r1], #64 +; CHECK-NEXT: vmov.f64 d6, d8 +; CHECK-NEXT: vmov.f64 d4, d9 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1], #64 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index db4a438..b76a97d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -719,24 +719,18 @@ define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov.f32 s15, s1 -; CHECK-NEXT: vmov.f32 s22, s4 -; CHECK-NEXT: vmov.f32 s23, s5 -; CHECK-NEXT: vmov.f32 s12, s16 -; CHECK-NEXT: vmov.f32 s13, s17 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s21, s9 -; CHECK-NEXT: vmov.f32 s0, s18 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: 
vmov.f64 d11, d2 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f64 d2, d7 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr |