diff options
author | Gaëtan Bossu <gaetan.bossu@arm.com> | 2025-08-01 11:32:58 +0000 |
---|---|---|
committer | Gaëtan Bossu <gaetan.bossu@arm.com> | 2025-08-01 16:43:33 +0000 |
commit | f573d2e983e34a2f99a37976d4956e7aa7c62acd (patch) | |
tree | dc5102ed171608943b1993eec7ab168b588bb868 | |
parent | ebcb4929004ae3f08b2ca3d5d246f29aa73600e1 (diff) | |
download | llvm-users/gbossu.vector.extract.2.zip llvm-users/gbossu.vector.extract.2.tar.gz llvm-users/gbossu.vector.extract.2.tar.bz2 |
[AArch64][ISel] Select constructive SVE2 ext instruction (branch: users/gbossu.vector.extract.2)
This adds patterns for selecting EXT_ZZI_B.
They are tested for fixed vectors using extract shuffles, and for
scalable vectors using llvm.vector.splice intrinsics.
We will get better codegen when enabling subreg liveness. Without it,
any use of a zpr2 tuple is always considered as using both zpr registers
of the pair.
14 files changed, 2236 insertions, 1390 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 0c4b4f4..201dd93 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4069,6 +4069,22 @@ let Predicates = [HasSVE2_or_SME] in { let AddedComplexity = 2 in { def : Pat<(nxv16i8 (AArch64ext nxv16i8:$zn1, nxv16i8:$zn2, (i32 imm0_255:$imm))), (EXT_ZZI_B (REG_SEQUENCE ZPR2, $zn1, zsub0, $zn2, zsub1), imm0_255:$imm)>; + + foreach VT = [nxv16i8] in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI_B (REG_SEQUENCE ZPR2, $Z1, zsub0, $Z2, zsub1), imm0_255:$index)>; + + foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI_B (REG_SEQUENCE ZPR2, $Z1, zsub0, $Z2, zsub1), imm0_255:$index)>; + + foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI_B (REG_SEQUENCE ZPR2, $Z1, zsub0, $Z2, zsub1), imm0_255:$index)>; + + foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI_B (REG_SEQUENCE ZPR2, $Z1, zsub0, $Z2, zsub1), imm0_255:$index)>; } } // End HasSVE2_or_SME diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll index 50975d1..13bec60 100644 --- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll +++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll @@ -192,7 +192,7 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 { ; CHECK-SVE2p1-NEXT: mov z1.s, p0/z, #1 // =0x1 ; CHECK-SVE2p1-NEXT: fmov s0, w8 ; CHECK-SVE2p1-NEXT: mov v0.s[1], v1.s[1] -; CHECK-SVE2p1-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-SVE2p1-NEXT: ext z1.b, { z1.b, z2.b }, #8 ; CHECK-SVE2p1-NEXT: // kill: def $d0 
killed $d0 killed $q0 ; CHECK-SVE2p1-NEXT: // kill: def $d1 killed $d1 killed $z1 ; CHECK-SVE2p1-NEXT: b use @@ -202,12 +202,12 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 { ; CHECK-SME2-NEXT: whilelo p0.s, x0, x1 ; CHECK-SME2-NEXT: cset w8, mi ; CHECK-SME2-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-SME2-NEXT: fmov s2, w8 +; CHECK-SME2-NEXT: fmov s3, w8 ; CHECK-SME2-NEXT: mov z0.s, z1.s[1] -; CHECK-SME2-NEXT: zip1 z0.s, z2.s, z0.s -; CHECK-SME2-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-SME2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-SME2-NEXT: ext z1.b, { z1.b, z2.b }, #8 ; CHECK-SME2-NEXT: // kill: def $d1 killed $d1 killed $z1 +; CHECK-SME2-NEXT: zip1 z0.s, z3.s, z0.s +; CHECK-SME2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-SME2-NEXT: b use %r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n) %v0 = call <2 x i1> @llvm.vector.extract.v2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll index 33d5ac4..3e8b3a4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll @@ -109,14 +109,13 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-LABEL: two_way_i8_i16_vl256: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: umlalb z0.h, z2.b, z1.b -; SME-NEXT: umlalt z0.h, z2.b, z1.b -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: umlalb z0.h, z3.b, z2.b +; SME-NEXT: umlalt z0.h, z3.b, z2.b +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc 
= load <16 x i16>, ptr %accptr %u = load <32 x i8>, ptr %uptr @@ -232,14 +231,13 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-LABEL: two_way_i16_i32_vl256: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: umlalb z0.s, z2.h, z1.h -; SME-NEXT: umlalt z0.s, z2.h, z1.h -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: umlalb z0.s, z3.h, z2.h +; SME-NEXT: umlalt z0.s, z3.h, z2.h +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <8 x i32>, ptr %accptr %u = load <16 x i16>, ptr %uptr @@ -355,14 +353,13 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-LABEL: two_way_i32_i64_vl256: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: umlalb z0.d, z2.s, z1.s -; SME-NEXT: umlalt z0.d, z2.s, z1.s -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: umlalb z0.d, z3.s, z2.s +; SME-NEXT: umlalt z0.d, z3.s, z2.s +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <4 x i64>, ptr %accptr %u = load <8 x i32>, ptr %uptr @@ -644,13 +641,12 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-LABEL: four_way_i8_i32_vl256: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: udot z0.s, z2.b, z1.b -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext 
z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: udot z0.s, z3.b, z2.b +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <8 x i32>, ptr %accptr %u = load <32 x i8>, ptr %uptr @@ -689,13 +685,12 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) ; SME-LABEL: four_way_i8_i32_vl256_usdot: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: usdot z0.s, z1.b, z2.b -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: usdot z0.s, z2.b, z3.b +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <8 x i32>, ptr %accptr %u = load <32 x i8>, ptr %uptr @@ -822,13 +817,12 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca ; SME-LABEL: four_way_i16_i64_vl256: ; SME: // %bb.0: ; SME-NEXT: ldr z0, [x0] -; SME-NEXT: ldr z1, [x1] -; SME-NEXT: ldr z2, [x2] -; SME-NEXT: udot z0.d, z2.h, z1.h -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ldr z2, [x1] +; SME-NEXT: ldr z3, [x2] +; SME-NEXT: udot z0.d, z3.h, z2.h +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <4 x i64>, ptr %accptr %u = load <16 x i16>, ptr %uptr @@ -999,10 +993,9 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) 
vscal ; SME-NEXT: ldr z0, [x0] ; SME-NEXT: uaddwb z0.d, z0.d, z2.s ; SME-NEXT: uaddwt z0.d, z0.d, z2.s -; SME-NEXT: mov z1.d, z0.d -; SME-NEXT: ext z1.b, z1.b, z0.b, #16 -; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 -; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SME-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; SME-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 +; SME-NEXT: mov z1.d, z2.d ; SME-NEXT: ret %acc = load <4 x i64>, ptr %accptr %u = load <32 x i8>, ptr %uptr diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll index 3f34d79..427d390 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll @@ -5,16 +5,15 @@ define void @main(ptr %0) { ; CHECK-LABEL: main: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uzp1 v0.2s, v1.2s, v0.2s -; CHECK-NEXT: neg v0.2s, v0.2s -; CHECK-NEXT: smov x8, v0.s[0] -; CHECK-NEXT: smov x9, v0.s[1] -; CHECK-NEXT: mov z1.d, p0/m, x8 -; CHECK-NEXT: mov z1.d, p0/m, x9 -; CHECK-NEXT: str z1, [x0] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: uzp1 v2.2s, v0.2s, v2.2s +; CHECK-NEXT: neg v2.2s, v2.2s +; CHECK-NEXT: smov x8, v2.s[0] +; CHECK-NEXT: smov x9, v2.s[1] +; CHECK-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEXT: mov z0.d, p0/m, x9 +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret "entry": %1 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 4d524bc..6fe6b8a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
-; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -228,25 +228,25 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-LABEL: load_sext_v4i32i256: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: mov z1.d, z1.d[1] -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.d, z2.d[1] ; CHECK-NEXT: asr x10, x9, #63 +; CHECK-NEXT: fmov x11, d2 ; CHECK-NEXT: stp x9, x10, [x8] -; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: asr x12, x11, #63 ; CHECK-NEXT: stp x10, x10, [x8, #16] -; CHECK-NEXT: stp x11, x12, [x8, #64] +; CHECK-NEXT: stp x11, x12, [x8, #32] ; CHECK-NEXT: fmov x11, d0 ; CHECK-NEXT: asr x10, x9, #63 -; CHECK-NEXT: stp x12, x12, [x8, #80] -; CHECK-NEXT: stp x10, x10, [x8, #48] +; CHECK-NEXT: stp x12, x12, [x8, #48] +; CHECK-NEXT: stp x10, x10, [x8, #80] ; CHECK-NEXT: asr x12, x11, #63 -; CHECK-NEXT: stp x9, x10, [x8, #32] +; CHECK-NEXT: stp x9, x10, [x8, #64] ; CHECK-NEXT: stp x12, x12, [x8, #112] ; CHECK-NEXT: stp x11, x12, [x8, #96] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index 35dd827..7ef35f1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -78,8 +78,8 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) { ; CHECK-LABEL: extract_subvector_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -119,7 +119,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -138,8 +138,8 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) { define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) { ; CHECK-LABEL: extract_subvector_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -198,8 +198,8 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) { define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) { ; CHECK-LABEL: extract_subvector_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed 
$d0 killed $z0 ; CHECK-NEXT: ret ; @@ -237,8 +237,8 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) { define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) { ; CHECK-LABEL: extract_subvector_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -297,8 +297,8 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { define <4 x half> @extract_subvector_v8f16(<8 x half> %op) { ; CHECK-LABEL: extract_subvector_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -357,8 +357,8 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) { define <2 x float> @extract_subvector_v4f32(<4 x float> %op) { ; CHECK-LABEL: extract_subvector_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -396,8 +396,8 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) { define <1 x double> @extract_subvector_v2f64(<2 x double> %op) { ; CHECK-LABEL: extract_subvector_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index e3d0a72..bc9b037 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -74,14 +74,14 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v8f16_to_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z1.s, z2.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32: @@ -122,21 +122,21 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v16f16_to_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: ext z0.b, { z1.b, z2.b }, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, 
z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ext z5.b, { z3.b, z4.b }, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvt z2.s, p0/m, z2.h -; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z2.s, z3.h ; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z3.s, z5.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: stp q3, q0, [x0] -; CHECK-NEXT: stp q2, q1, [x0, #32] +; CHECK-NEXT: fcvt z2.s, p0/m, z2.h +; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index ae7c676..0e34b2c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -58,21 +58,21 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: sunpklo z0.s, z1.h +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { 
z1.h, z2.h } +; CHECK-NEXT: uzp1 z3.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -124,40 +124,40 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-LABEL: sdiv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z2.h, z1.b -; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: sunpklo z5.h, z1.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: sunpklo z16.h, z3.b +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sunpklo z0.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: sunpklo z3.h, z1.b +; CHECK-NEXT: sunpklo z1.h, z2.b +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: sunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h 
-; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sunpklo z2.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } -; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } -; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z6.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -241,73 +241,73 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q6, q3, [x1] +; CHECK-NEXT: ldp q18, q4, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q2, [x0, #16] -; CHECK-NEXT: sunpklo z1.h, z3.b -; CHECK-NEXT: sunpklo z4.h, z2.b -; CHECK-NEXT: sunpklo z7.h, z6.b -; CHECK-NEXT: sunpklo z0.s, z1.h -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: sunpklo z17.s, z7.h -; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: sunpklo z6.h, z4.b +; CHECK-NEXT: sunpklo z16.h, z2.b +; CHECK-NEXT: ext z4.b, { z4.b, z5.b }, #8 +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: sunpklo z20.h, z18.b +; CHECK-NEXT: ext z18.b, { 
z18.b, z19.b }, #8 +; CHECK-NEXT: sunpklo z3.h, z4.b +; CHECK-NEXT: sunpklo z0.s, z6.h +; CHECK-NEXT: sunpklo z1.s, z16.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ext z1.b, { z6.b, z7.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ldr q16, [x0] ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sunpklo z6.h, z6.b -; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: sunpklo z4.h, z2.b +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z22.h, z16.b +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sunpklo z17.h, z18.b +; CHECK-NEXT: sunpklo z24.s, z22.h +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: sunpklo z5.h, z2.b ; CHECK-NEXT: sunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z5.h +; CHECK-NEXT: ext z4.b, { z5.b, z6.b }, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: sunpklo z16.h, z5.b -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: sunpklo z5.h, z5.b -; CHECK-NEXT: sunpklo z18.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: sunpklo z7.s, z20.h +; CHECK-NEXT: ext z20.b, { z20.b, z21.b }, #8 +; CHECK-NEXT: ext z21.b, { z22.b, z23.b }, #8 +; CHECK-NEXT: sunpklo z20.s, z20.h +; CHECK-NEXT: sunpklo z21.s, z21.h +; CHECK-NEXT: sdivr z20.s, p0/m, z20.s, z21.s +; CHECK-NEXT: sunpklo z21.h, z16.b +; CHECK-NEXT: sunpklo z16.s, z17.h +; CHECK-NEXT: ext z17.b, { z17.b, z18.b }, #8 +; CHECK-NEXT: ext z18.b, { z21.b, z22.b }, #8 +; CHECK-NEXT: sunpklo z19.s, z21.h +; CHECK-NEXT: sunpklo z17.s, z17.h +; CHECK-NEXT: sunpklo z18.s, z18.h +; 
CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z24.s +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z19.s +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z7.h, z20.h, z20.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: sunpklo z18.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: sunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s -; CHECK-NEXT: uzp1 z18.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z19.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z18.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h -; CHECK-NEXT: splice z2.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h +; CHECK-NEXT: splice z2.h, p0, { z6.h, z7.h } +; CHECK-NEXT: uzp1 z1.h, z17.h, z17.h ; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: splice z3.h, p0, { z18.h, z19.h } -; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z19.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z4.h, z5.h } ; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: splice z1.h, p0, { z18.h, z19.h } ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z4.b, z5.b } +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z1.b, p0, { z2.b, z3.b } ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -534,21 +534,21 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { define <8 x i16> 
@sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-LABEL: sdiv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: sunpklo z0.s, z1.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z3.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -598,33 +598,33 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q16, q2, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.s, 
z2.h +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: ldr q5, [x0] +; CHECK-NEXT: sunpklo z1.s, z2.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h +; CHECK-NEXT: sunpklo z18.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: sunpklo z6.s, z16.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z7.h, z7.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } -; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z2.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: splice z1.h, p0, { z3.h, z4.h } ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; @@ -972,21 +972,21 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: uunpklo z0.s, z1.h +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: 
udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z3.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1038,40 +1038,40 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-LABEL: udiv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.h, z1.b -; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: uunpklo z5.h, z1.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: uunpklo z16.h, z3.b +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: uunpklo z0.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: uunpklo z1.h, z2.b +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z16.h +; 
CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: uunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: uunpklo z2.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } -; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } -; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z6.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1155,73 +1155,73 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q6, q3, [x1] +; CHECK-NEXT: ldp q18, q4, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q2, [x0, #16] -; CHECK-NEXT: uunpklo z1.h, z3.b -; CHECK-NEXT: uunpklo z4.h, z2.b -; CHECK-NEXT: uunpklo z7.h, z6.b -; CHECK-NEXT: uunpklo z0.s, z1.h -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: uunpklo z17.s, z7.h -; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; 
CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: uunpklo z6.h, z4.b +; CHECK-NEXT: uunpklo z16.h, z2.b +; CHECK-NEXT: ext z4.b, { z4.b, z5.b }, #8 +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: uunpklo z20.h, z18.b +; CHECK-NEXT: ext z18.b, { z18.b, z19.b }, #8 +; CHECK-NEXT: uunpklo z3.h, z4.b +; CHECK-NEXT: uunpklo z0.s, z6.h +; CHECK-NEXT: uunpklo z1.s, z16.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ext z1.b, { z6.b, z7.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ldr q16, [x0] ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uunpklo z6.h, z6.b -; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: uunpklo z4.h, z2.b +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z22.h, z16.b +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: uunpklo z17.h, z18.b +; CHECK-NEXT: uunpklo z24.s, z22.h +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: uunpklo z5.h, z2.b ; CHECK-NEXT: uunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z5.h +; CHECK-NEXT: ext z4.b, { z5.b, z6.b }, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: uunpklo z16.h, z5.b -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: uunpklo z5.h, z5.b -; CHECK-NEXT: uunpklo z18.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: uunpklo z7.s, z20.h +; CHECK-NEXT: ext z20.b, { z20.b, z21.b }, #8 +; CHECK-NEXT: ext z21.b, { z22.b, z23.b }, #8 +; CHECK-NEXT: uunpklo z20.s, z20.h +; CHECK-NEXT: uunpklo 
z21.s, z21.h +; CHECK-NEXT: udivr z20.s, p0/m, z20.s, z21.s +; CHECK-NEXT: uunpklo z21.h, z16.b +; CHECK-NEXT: uunpklo z16.s, z17.h +; CHECK-NEXT: ext z17.b, { z17.b, z18.b }, #8 +; CHECK-NEXT: ext z18.b, { z21.b, z22.b }, #8 +; CHECK-NEXT: uunpklo z19.s, z21.h +; CHECK-NEXT: uunpklo z17.s, z17.h +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z24.s +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z19.s +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z7.h, z20.h, z20.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: uunpklo z18.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: uunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s -; CHECK-NEXT: uzp1 z18.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z19.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z18.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h -; CHECK-NEXT: splice z2.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h +; CHECK-NEXT: splice z2.h, p0, { z6.h, z7.h } +; CHECK-NEXT: uzp1 z1.h, z17.h, z17.h ; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: splice z3.h, p0, { z18.h, z19.h } -; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z19.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z4.h, z5.h } ; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: splice z1.h, p0, { z18.h, z19.h } ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b -; CHECK-NEXT: uzp1 
z5.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z4.b, z5.b } +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b ; CHECK-NEXT: splice z1.b, p0, { z2.b, z3.b } ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -1448,21 +1448,21 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-LABEL: udiv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: ext z2.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z3.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1512,33 +1512,33 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q16, q2, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext 
z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: uunpklo z7.s, z16.h +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: ldr q5, [x0] +; CHECK-NEXT: uunpklo z1.s, z2.h ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h +; CHECK-NEXT: uunpklo z18.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: uunpklo z6.s, z16.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z7.h, z7.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } -; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z2.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: splice z1.h, p0, { z3.h, z4.h } ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index b022c19..eb8d612 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -14,19 +14,33 @@ target triple 
= "aarch64-unknown-linux-gnu" ; type's element type is not byte based and thus cannot be lowered directly to ; an SVE instruction. define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { -; CHECK-LABEL: sext_v8i1_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 -; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 -; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i1_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: lsl z1.s, z1.s, #31 +; SVE-NEXT: lsl z0.s, z0.s, #31 +; SVE-NEXT: asr z1.s, z1.s, #31 +; SVE-NEXT: asr z0.s, z0.s, #31 +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i1_v8i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z1.s, z2.h +; SVE2-NEXT: lsl z0.s, z0.s, #31 +; SVE2-NEXT: lsl z1.s, z1.s, #31 +; SVE2-NEXT: asr z0.s, z0.s, #31 +; SVE2-NEXT: asr z1.s, z1.s, #31 +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: ; NONEON-NOSVE: // %bb.0: @@ -70,19 +84,33 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; type's element type is not power-of-2 based and thus cannot be lowered ; directly to an SVE instruction. 
define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { -; CHECK-LABEL: sext_v4i3_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: lsl z1.d, z1.d, #61 -; CHECK-NEXT: lsl z0.d, z0.d, #61 -; CHECK-NEXT: asr z1.d, z1.d, #61 -; CHECK-NEXT: asr z0.d, z0.d, #61 -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v4i3_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: lsl z1.d, z1.d, #61 +; SVE-NEXT: lsl z0.d, z0.d, #61 +; SVE-NEXT: asr z1.d, z1.d, #61 +; SVE-NEXT: asr z0.d, z0.d, #61 +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v4i3_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.d, z2.s +; SVE2-NEXT: lsl z0.d, z0.d, #61 +; SVE2-NEXT: lsl z1.d, z1.d, #61 +; SVE2-NEXT: asr z0.d, z0.d, #61 +; SVE2-NEXT: asr z1.d, z1.d, #61 +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -113,14 +141,23 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v16i8_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v16i8_v16i16: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, 
z0.b +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v16i8_v16i16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: sunpklo z1.h, z2.b +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: @@ -171,20 +208,35 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; NOTE: Extra 'add' is to prevent the extend being combined with the load. define void @sext_v32i8_v32i16(ptr %in, ptr %out) { -; CHECK-LABEL: sext_v32i8_v32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: sunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v32i8_v32i16: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: sunpklo z2.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z1.h, z1.b +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v32i8_v32i16: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z2.h, z2.b +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: sunpklo z3.h, z4.b +; SVE2-NEXT: sunpklo z1.h, z5.b +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: ; 
NONEON-NOSVE: // %bb.0: @@ -365,15 +417,25 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v8i8_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i8_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i8_v8i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z1.s, z2.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: @@ -402,21 +464,37 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { } define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v16i8_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v16i8_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z2.s, z1.h +; SVE-NEXT: ext 
z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v16i8_v16i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: sunpklo z2.h, z2.b +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: sunpklo z1.s, z4.h +; SVE2-NEXT: sunpklo z3.s, z5.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: @@ -460,34 +538,63 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { } define void @sext_v32i8_v32i32(ptr %in, ptr %out) { -; CHECK-LABEL: sext_v32i8_v32i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: sunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.s, z1.h -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v32i8_v32i32: +; SVE: // %bb.0: +; SVE-NEXT: ldp 
q1, q0, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: sunpklo z2.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z4.s, z2.h +; SVE-NEXT: sunpklo z5.s, z3.h +; SVE-NEXT: sunpklo z1.h, z1.b +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: sunpklo z6.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z7.s, z1.h +; SVE-NEXT: sunpklo z2.s, z2.h +; SVE-NEXT: sunpklo z3.s, z3.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: stp q5, q3, [x1] +; SVE-NEXT: stp q4, q2, [x1, #64] +; SVE-NEXT: stp q6, q0, [x1, #96] +; SVE-NEXT: stp q7, q1, [x1, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v32i8_v32i32: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.h, z2.b +; SVE2-NEXT: sunpklo z5.h, z0.b +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z16.h, z4.b +; SVE2-NEXT: ext z1.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z18.h, z0.b +; SVE2-NEXT: ext z0.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: sunpklo z3.s, z5.h +; SVE2-NEXT: sunpklo z1.s, z1.h +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: ext z4.b, { z16.b, z17.b }, #8 +; SVE2-NEXT: ext z5.b, { z18.b, z19.b }, #8 +; SVE2-NEXT: sunpklo z6.s, z16.h +; SVE2-NEXT: stp q3, q0, [x1] +; SVE2-NEXT: sunpklo z3.s, z18.h +; SVE2-NEXT: stp q2, q1, [x1, #64] +; SVE2-NEXT: sunpklo z2.s, z4.h +; SVE2-NEXT: sunpklo z1.s, z5.h +; SVE2-NEXT: stp q3, q1, [x1, #32] +; SVE2-NEXT: stp q6, q2, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: @@ -659,18 +766,31 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; extend is a two step process 
where the container is any_extend'd with the ; result feeding an inreg sign extend. define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v4i8_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sxtb z1.d, p0/m, z1.d -; CHECK-NEXT: sxtb z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v4i8_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: sxtb z1.d, p0/m, z1.d +; SVE-NEXT: sxtb z0.d, p0/m, z0.d +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v4i8_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: ptrue p0.d, vl2 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.d, z2.s +; SVE2-NEXT: sxtb z0.d, p0/m, z0.d +; SVE2-NEXT: sxtb z1.d, p0/m, z1.d +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -695,22 +815,39 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { } define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v8i8_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, 
q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i8_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z2.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i8_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z1.d, z4.s +; SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z3.d, z5.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: @@ -741,35 +878,65 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { } define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { -; CHECK-LABEL: sext_v16i8_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.d, z1.s -; CHECK-NEXT: ext 
z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x0] -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q6, q1, [x0, #32] -; CHECK-NEXT: stp q5, q3, [x0, #64] -; CHECK-NEXT: stp q7, q0, [x0, #96] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v16i8_v16i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z2.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z4.d, z2.s +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z5.d, z3.s +; SVE-NEXT: sunpklo z2.d, z2.s +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: sunpklo z6.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z7.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.d, z3.s +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: stp q4, q2, [x0] +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q6, q1, [x0, #32] +; SVE-NEXT: stp q5, q3, [x0, #64] +; SVE-NEXT: stp q7, q0, [x0, #96] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v16i8_v16i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: sunpklo z2.h, z2.b +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z5.s, z2.h +; SVE2-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z3.s, z4.h +; SVE2-NEXT: ext z7.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: sunpklo z1.s, z2.h +; SVE2-NEXT: ext z16.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: sunpklo z5.d, z5.s +; SVE2-NEXT: sunpklo z7.d, z7.s +; SVE2-NEXT: ext z6.b, { z3.b, z4.b }, #8 +; 
SVE2-NEXT: sunpklo z3.d, z3.s +; SVE2-NEXT: sunpklo z16.d, z16.s +; SVE2-NEXT: sunpklo z4.d, z6.s +; SVE2-NEXT: stp q0, q7, [x0] +; SVE2-NEXT: ext z0.b, { z1.b, z2.b }, #8 +; SVE2-NEXT: sunpklo z1.d, z1.s +; SVE2-NEXT: stp q5, q16, [x0, #64] +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: stp q3, q4, [x0, #32] +; SVE2-NEXT: stp q1, q0, [x0, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: @@ -817,67 +984,125 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { } define void @sext_v32i8_v32i64(ptr %in, ptr %out) { -; CHECK-LABEL: sext_v32i8_v32i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z4.s, z3.h -; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: sunpklo z16.d, z4.s -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z6.s, z2.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z5.d, z5.s -; CHECK-NEXT: sunpklo z20.d, z1.s -; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sunpklo z18.d, z6.s -; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: sunpklo z19.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: sunpklo z16.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; 
CHECK-NEXT: sunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q5, q17, [x1] -; CHECK-NEXT: sunpklo z5.d, z6.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: stp q16, q0, [x1, #32] -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: stp q18, q5, [x1, #64] -; CHECK-NEXT: sunpklo z1.d, z4.s -; CHECK-NEXT: sunpklo z3.d, z6.s -; CHECK-NEXT: stp q7, q1, [x1, #224] -; CHECK-NEXT: stp q2, q3, [x1, #96] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v32i8_v32i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: mov z2.d, z0.d +; SVE-NEXT: sunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.h, z0.b +; SVE-NEXT: sunpklo z1.h, z1.b +; SVE-NEXT: sunpklo z4.s, z3.h +; SVE-NEXT: sunpklo z2.h, z2.b +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: sunpklo z5.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: mov z7.d, z1.d +; SVE-NEXT: sunpklo z16.d, z4.s +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: sunpklo z6.s, z2.h +; SVE-NEXT: ext z4.b, z4.b, z0.b, #8 +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z7.b, z7.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: mov z17.d, z5.d +; SVE-NEXT: sunpklo z3.s, z3.h +; SVE-NEXT: sunpklo z5.d, z5.s +; SVE-NEXT: sunpklo z20.d, z1.s +; SVE-NEXT: sunpklo z4.d, z4.s +; SVE-NEXT: sunpklo z2.s, z2.h +; SVE-NEXT: sunpklo z7.s, z7.h +; SVE-NEXT: sunpklo z18.d, z6.s +; SVE-NEXT: ext z17.b, z17.b, z0.b, #8 +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: ext z6.b, z6.b, z0.b, #8 +; SVE-NEXT: sunpklo z19.d, z3.s +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: stp q16, q4, [x1, #128] +; 
SVE-NEXT: sunpklo z16.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z17.d, z17.s +; SVE-NEXT: mov z4.d, z7.d +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: sunpklo z3.d, z3.s +; SVE-NEXT: sunpklo z7.d, z7.s +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q5, q17, [x1] +; SVE-NEXT: sunpklo z5.d, z6.s +; SVE-NEXT: mov z6.d, z2.d +; SVE-NEXT: stp q19, q3, [x1, #160] +; SVE-NEXT: sunpklo z2.d, z2.s +; SVE-NEXT: ext z4.b, z4.b, z0.b, #8 +; SVE-NEXT: stp q16, q0, [x1, #32] +; SVE-NEXT: ext z6.b, z6.b, z0.b, #8 +; SVE-NEXT: stp q20, q1, [x1, #192] +; SVE-NEXT: stp q18, q5, [x1, #64] +; SVE-NEXT: sunpklo z1.d, z4.s +; SVE-NEXT: sunpklo z3.d, z6.s +; SVE-NEXT: stp q7, q1, [x1, #224] +; SVE-NEXT: stp q2, q3, [x1, #96] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v32i8_v32i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.h, z2.b +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.h, z0.b +; SVE2-NEXT: sunpklo z6.h, z4.b +; SVE2-NEXT: sunpklo z4.h, z5.b +; SVE2-NEXT: ext z16.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: sunpklo z17.s, z0.h +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z19.s, z6.h +; SVE2-NEXT: sunpklo z21.s, z16.h +; SVE2-NEXT: ext z6.b, { z6.b, z7.b }, #8 +; SVE2-NEXT: ext z7.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z23.s, z0.h +; SVE2-NEXT: ext z0.b, { z17.b, z18.b }, #8 +; SVE2-NEXT: sunpklo z16.d, z17.s +; SVE2-NEXT: ext z1.b, { z4.b, z5.b }, #8 +; SVE2-NEXT: sunpklo z4.s, z4.h +; SVE2-NEXT: sunpklo z3.d, z19.s +; SVE2-NEXT: ext z17.b, { z19.b, z20.b }, #8 +; SVE2-NEXT: sunpklo z19.s, z6.h +; SVE2-NEXT: ext z6.b, { z21.b, z22.b }, #8 +; SVE2-NEXT: sunpklo z18.d, z21.s +; SVE2-NEXT: sunpklo z7.d, z7.s +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: str q16, [x1] +; SVE2-NEXT: ext z21.b, { z4.b, z5.b }, #8 +; 
SVE2-NEXT: sunpklo z4.d, z4.s +; SVE2-NEXT: sunpklo z5.d, z17.s +; SVE2-NEXT: sunpklo z6.d, z6.s +; SVE2-NEXT: stp q2, q7, [x1, #128] +; SVE2-NEXT: sunpklo z2.d, z23.s +; SVE2-NEXT: stp q3, q5, [x1, #192] +; SVE2-NEXT: ext z3.b, { z23.b, z24.b }, #8 +; SVE2-NEXT: stp q18, q6, [x1, #160] +; SVE2-NEXT: sunpklo z17.s, z1.h +; SVE2-NEXT: sunpklo z1.d, z21.s +; SVE2-NEXT: stp q0, q2, [x1, #16] +; SVE2-NEXT: ext z2.b, { z19.b, z20.b }, #8 +; SVE2-NEXT: sunpklo z3.d, z3.s +; SVE2-NEXT: ext z0.b, { z17.b, z18.b }, #8 +; SVE2-NEXT: stp q4, q1, [x1, #64] +; SVE2-NEXT: sunpklo z4.d, z19.s +; SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z5.d, z17.s +; SVE2-NEXT: str q3, [x1, #48] +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: stp q4, q2, [x1, #224] +; SVE2-NEXT: stp q5, q0, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: ; NONEON-NOSVE: // %bb.0: @@ -1054,14 +1279,23 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { -; CHECK-LABEL: sext_v8i16_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i16_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i16_v8i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z1.s, z2.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: @@ -1091,20 +1325,35 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { } define void @sext_v16i16_v16i32(ptr %in, ptr %out) { -; 
CHECK-LABEL: sext_v16i16_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: sunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v16i16_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.h, z0.h, z0.h +; SVE-NEXT: add z1.h, z1.h, z1.h +; SVE-NEXT: sunpklo z2.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v16i16_v16i32: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.h, z0.h, z0.h +; SVE2-NEXT: add z0.h, z1.h, z1.h +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z3.s, z4.h +; SVE2-NEXT: sunpklo z1.s, z5.h +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: @@ -1185,15 +1434,25 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { -; CHECK-LABEL: sext_v4i16_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v4i16_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: sunpklo z0.s, z0.h +; 
SVE-NEXT: sunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v4i16_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: sunpklo z1.d, z2.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -1216,21 +1475,37 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { } define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { -; CHECK-LABEL: sext_v8i16_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i16_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z2.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i16_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.s, z0.h +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; 
SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z1.d, z4.s +; SVE2-NEXT: sunpklo z3.d, z5.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: @@ -1262,34 +1537,63 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { } define void @sext_v16i16_v16i64(ptr %in, ptr %out) { -; CHECK-LABEL: sext_v16i16_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: sunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.d, z1.s -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v16i16_v16i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.h, z0.h, z0.h +; SVE-NEXT: add z1.h, z1.h, z1.h +; SVE-NEXT: sunpklo z2.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.s, z0.h +; SVE-NEXT: sunpklo z4.d, z2.s +; SVE-NEXT: sunpklo z5.d, z3.s +; SVE-NEXT: sunpklo z1.s, z1.h +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: sunpklo z6.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z7.d, z1.s +; SVE-NEXT: sunpklo z2.d, z2.s +; SVE-NEXT: 
sunpklo z3.d, z3.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: stp q5, q3, [x1] +; SVE-NEXT: stp q4, q2, [x1, #64] +; SVE-NEXT: stp q6, q0, [x1, #96] +; SVE-NEXT: stp q7, q1, [x1, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v16i16_v16i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.h, z0.h, z0.h +; SVE2-NEXT: add z0.h, z1.h, z1.h +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z2.s, z2.h +; SVE2-NEXT: sunpklo z5.s, z0.h +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z16.s, z4.h +; SVE2-NEXT: ext z1.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: sunpklo z18.s, z0.h +; SVE2-NEXT: ext z0.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z3.d, z5.s +; SVE2-NEXT: sunpklo z1.d, z1.s +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: ext z4.b, { z16.b, z17.b }, #8 +; SVE2-NEXT: ext z5.b, { z18.b, z19.b }, #8 +; SVE2-NEXT: sunpklo z6.d, z16.s +; SVE2-NEXT: stp q3, q0, [x1] +; SVE2-NEXT: sunpklo z3.d, z18.s +; SVE2-NEXT: stp q2, q1, [x1, #64] +; SVE2-NEXT: sunpklo z2.d, z4.s +; SVE2-NEXT: sunpklo z1.d, z5.s +; SVE2-NEXT: stp q3, q1, [x1, #32] +; SVE2-NEXT: stp q6, q2, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: @@ -1375,14 +1679,23 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { -; CHECK-LABEL: sext_v4i32_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v4i32_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: sunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: 
sext_v4i32_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: sunpklo z1.d, z2.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -1404,20 +1717,35 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { } define void @sext_v8i32_v8i64(ptr %in, ptr %out) { -; CHECK-LABEL: sext_v8i32_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.s, z0.s, z0.s -; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: sext_v8i32_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.s, z0.s, z0.s +; SVE-NEXT: add z1.s, z1.s, z1.s +; SVE-NEXT: sunpklo z2.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: sunpklo z3.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: sunpklo z0.d, z0.s +; SVE-NEXT: sunpklo z1.d, z1.s +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: sext_v8i32_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.s, z0.s, z0.s +; SVE2-NEXT: add z0.s, z1.s, z1.s +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: sunpklo z2.d, z2.s +; SVE2-NEXT: sunpklo z0.d, z0.s +; SVE2-NEXT: sunpklo z3.d, z4.s +; SVE2-NEXT: sunpklo z1.d, z5.s +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: @@ -1466,14 +1794,23 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; define void @zext_v16i8_v16i16(<16 x i8> %a, ptr 
%out) { -; CHECK-LABEL: zext_v16i8_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v16i8_v16i16: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v16i8_v16i16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: uunpklo z1.h, z2.b +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: @@ -1524,20 +1861,35 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; NOTE: Extra 'add' is to prevent the extend being combined with the load. 
define void @zext_v32i8_v32i16(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v32i8_v32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: uunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v32i8_v32i16: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: uunpklo z2.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z1.h, z1.b +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v32i8_v32i16: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z2.h, z2.b +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: uunpklo z3.h, z4.b +; SVE2-NEXT: uunpklo z1.h, z5.b +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: ; NONEON-NOSVE: // %bb.0: @@ -1718,15 +2070,25 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { -; CHECK-LABEL: zext_v8i8_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v8i8_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 
def $z0 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v8i8_v8i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z1.s, z2.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: @@ -1755,21 +2117,37 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { } define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { -; CHECK-LABEL: zext_v16i8_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v16i8_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z2.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v16i8_v16i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: uunpklo z2.h, z2.b +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; 
SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: uunpklo z1.s, z4.h +; SVE2-NEXT: uunpklo z3.s, z5.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: @@ -1813,34 +2191,63 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { } define void @zext_v32i8_v32i32(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v32i8_v32i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: uunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.s, z1.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v32i8_v32i32: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: uunpklo z2.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z4.s, z2.h +; SVE-NEXT: uunpklo z5.s, z3.h +; SVE-NEXT: uunpklo z1.h, z1.b +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: uunpklo z6.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z7.s, z1.h +; 
SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z3.s, z3.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: stp q5, q3, [x1] +; SVE-NEXT: stp q4, q2, [x1, #64] +; SVE-NEXT: stp q6, q0, [x1, #96] +; SVE-NEXT: stp q7, q1, [x1, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v32i8_v32i32: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.h, z2.b +; SVE2-NEXT: uunpklo z5.h, z0.b +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z16.h, z4.b +; SVE2-NEXT: ext z1.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z18.h, z0.b +; SVE2-NEXT: ext z0.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: uunpklo z3.s, z5.h +; SVE2-NEXT: uunpklo z1.s, z1.h +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: ext z4.b, { z16.b, z17.b }, #8 +; SVE2-NEXT: ext z5.b, { z18.b, z19.b }, #8 +; SVE2-NEXT: uunpklo z6.s, z16.h +; SVE2-NEXT: stp q3, q0, [x1] +; SVE2-NEXT: uunpklo z3.s, z18.h +; SVE2-NEXT: stp q2, q1, [x1, #64] +; SVE2-NEXT: uunpklo z2.s, z4.h +; SVE2-NEXT: uunpklo z1.s, z5.h +; SVE2-NEXT: stp q3, q1, [x1, #32] +; SVE2-NEXT: stp q6, q2, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: @@ -2012,16 +2419,27 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; extend is a two step process where the container is zero_extend_inreg'd with ; the result feeding a normal zero extend from halfs to doublewords. 
define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { -; CHECK-LABEL: zext_v4i8_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v4i8_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: and z0.h, z0.h, #0xff +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v4i8_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: and z0.h, z0.h, #0xff +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.d, z2.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -2046,22 +2464,39 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { } define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { -; CHECK-LABEL: zext_v8i8_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v8i8_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: 
uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z2.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v8i8_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z1.d, z4.s +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z3.d, z5.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: @@ -2096,35 +2531,65 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { } define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { -; CHECK-LABEL: zext_v16i8_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x0] -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q6, q1, [x0, #32] -; 
CHECK-NEXT: stp q5, q3, [x0, #64] -; CHECK-NEXT: stp q7, q0, [x0, #96] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v16i8_v16i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.h, z0.b +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z2.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z4.d, z2.s +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z5.d, z3.s +; SVE-NEXT: uunpklo z2.d, z2.s +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: uunpklo z6.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z7.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.d, z3.s +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: stp q4, q2, [x0] +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q6, q1, [x0, #32] +; SVE-NEXT: stp q5, q3, [x0, #64] +; SVE-NEXT: stp q7, q0, [x0, #96] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v16i8_v16i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: uunpklo z2.h, z2.b +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z5.s, z2.h +; SVE2-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z3.s, z4.h +; SVE2-NEXT: ext z7.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.s, z2.h +; SVE2-NEXT: ext z16.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: uunpklo z5.d, z5.s +; SVE2-NEXT: uunpklo z7.d, z7.s +; SVE2-NEXT: ext z6.b, { z3.b, z4.b }, #8 +; SVE2-NEXT: uunpklo z3.d, z3.s +; SVE2-NEXT: uunpklo z16.d, z16.s +; SVE2-NEXT: uunpklo z4.d, z6.s +; SVE2-NEXT: stp q0, q7, [x0] +; SVE2-NEXT: ext z0.b, { z1.b, z2.b }, #8 +; SVE2-NEXT: uunpklo z1.d, z1.s +; SVE2-NEXT: stp q5, q16, [x0, #64] +; SVE2-NEXT: uunpklo z0.d, z0.s 
+; SVE2-NEXT: stp q3, q4, [x0, #32] +; SVE2-NEXT: stp q1, q0, [x0, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: @@ -2180,67 +2645,125 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { } define void @zext_v32i8_v32i64(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v32i8_v32i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z4.s, z3.h -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: uunpklo z16.d, z4.s -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z6.s, z2.h -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z5.d, z5.s -; CHECK-NEXT: uunpklo z20.d, z1.s -; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uunpklo z18.d, z6.s -; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: uunpklo z19.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: uunpklo z16.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uunpklo z7.d, z7.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q5, q17, [x1] -; CHECK-NEXT: uunpklo z5.d, z6.s -; 
CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: stp q16, q0, [x1, #32] -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: stp q18, q5, [x1, #64] -; CHECK-NEXT: uunpklo z1.d, z4.s -; CHECK-NEXT: uunpklo z3.d, z6.s -; CHECK-NEXT: stp q7, q1, [x1, #224] -; CHECK-NEXT: stp q2, q3, [x1, #96] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v32i8_v32i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: add z0.b, z0.b, z0.b +; SVE-NEXT: add z1.b, z1.b, z1.b +; SVE-NEXT: mov z2.d, z0.d +; SVE-NEXT: uunpklo z3.h, z1.b +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.h, z0.b +; SVE-NEXT: uunpklo z1.h, z1.b +; SVE-NEXT: uunpklo z4.s, z3.h +; SVE-NEXT: uunpklo z2.h, z2.b +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: uunpklo z5.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: mov z7.d, z1.d +; SVE-NEXT: uunpklo z16.d, z4.s +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z6.s, z2.h +; SVE-NEXT: ext z4.b, z4.b, z0.b, #8 +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z7.b, z7.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: mov z17.d, z5.d +; SVE-NEXT: uunpklo z3.s, z3.h +; SVE-NEXT: uunpklo z5.d, z5.s +; SVE-NEXT: uunpklo z20.d, z1.s +; SVE-NEXT: uunpklo z4.d, z4.s +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z7.s, z7.h +; SVE-NEXT: uunpklo z18.d, z6.s +; SVE-NEXT: ext z17.b, z17.b, z0.b, #8 +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: ext z6.b, z6.b, z0.b, #8 +; SVE-NEXT: uunpklo z19.d, z3.s +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: stp q16, q4, [x1, #128] +; SVE-NEXT: uunpklo z16.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z17.d, z17.s +; SVE-NEXT: mov z4.d, z7.d +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: uunpklo z3.d, z3.s +; SVE-NEXT: uunpklo z7.d, z7.s +; SVE-NEXT: uunpklo z0.d, z0.s +; 
SVE-NEXT: stp q5, q17, [x1] +; SVE-NEXT: uunpklo z5.d, z6.s +; SVE-NEXT: mov z6.d, z2.d +; SVE-NEXT: stp q19, q3, [x1, #160] +; SVE-NEXT: uunpklo z2.d, z2.s +; SVE-NEXT: ext z4.b, z4.b, z0.b, #8 +; SVE-NEXT: stp q16, q0, [x1, #32] +; SVE-NEXT: ext z6.b, z6.b, z0.b, #8 +; SVE-NEXT: stp q20, q1, [x1, #192] +; SVE-NEXT: stp q18, q5, [x1, #64] +; SVE-NEXT: uunpklo z1.d, z4.s +; SVE-NEXT: uunpklo z3.d, z6.s +; SVE-NEXT: stp q7, q1, [x1, #224] +; SVE-NEXT: stp q2, q3, [x1, #96] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v32i8_v32i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.b, z0.b, z0.b +; SVE2-NEXT: add z0.b, z1.b, z1.b +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.h, z2.b +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.h, z0.b +; SVE2-NEXT: uunpklo z6.h, z4.b +; SVE2-NEXT: uunpklo z4.h, z5.b +; SVE2-NEXT: ext z16.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: uunpklo z17.s, z0.h +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z19.s, z6.h +; SVE2-NEXT: uunpklo z21.s, z16.h +; SVE2-NEXT: ext z6.b, { z6.b, z7.b }, #8 +; SVE2-NEXT: ext z7.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z23.s, z0.h +; SVE2-NEXT: ext z0.b, { z17.b, z18.b }, #8 +; SVE2-NEXT: uunpklo z16.d, z17.s +; SVE2-NEXT: ext z1.b, { z4.b, z5.b }, #8 +; SVE2-NEXT: uunpklo z4.s, z4.h +; SVE2-NEXT: uunpklo z3.d, z19.s +; SVE2-NEXT: ext z17.b, { z19.b, z20.b }, #8 +; SVE2-NEXT: uunpklo z19.s, z6.h +; SVE2-NEXT: ext z6.b, { z21.b, z22.b }, #8 +; SVE2-NEXT: uunpklo z18.d, z21.s +; SVE2-NEXT: uunpklo z7.d, z7.s +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: str q16, [x1] +; SVE2-NEXT: ext z21.b, { z4.b, z5.b }, #8 +; SVE2-NEXT: uunpklo z4.d, z4.s +; SVE2-NEXT: uunpklo z5.d, z17.s +; SVE2-NEXT: uunpklo z6.d, z6.s +; SVE2-NEXT: stp q2, q7, [x1, #128] +; SVE2-NEXT: uunpklo z2.d, z23.s +; SVE2-NEXT: stp q3, q5, [x1, #192] +; SVE2-NEXT: ext z3.b, { z23.b, z24.b }, #8 +; SVE2-NEXT: 
stp q18, q6, [x1, #160] +; SVE2-NEXT: uunpklo z17.s, z1.h +; SVE2-NEXT: uunpklo z1.d, z21.s +; SVE2-NEXT: stp q0, q2, [x1, #16] +; SVE2-NEXT: ext z2.b, { z19.b, z20.b }, #8 +; SVE2-NEXT: uunpklo z3.d, z3.s +; SVE2-NEXT: ext z0.b, { z17.b, z18.b }, #8 +; SVE2-NEXT: stp q4, q1, [x1, #64] +; SVE2-NEXT: uunpklo z4.d, z19.s +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z5.d, z17.s +; SVE2-NEXT: str q3, [x1, #48] +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: stp q4, q2, [x1, #224] +; SVE2-NEXT: stp q5, q0, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: ; NONEON-NOSVE: // %bb.0: @@ -2440,14 +2963,23 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { -; CHECK-LABEL: zext_v8i16_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v8i16_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v8i16_v8i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z1.s, z2.h +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: @@ -2477,20 +3009,35 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { } define void @zext_v16i16_v16i32(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v16i16_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; 
CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v16i16_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.h, z0.h, z0.h +; SVE-NEXT: add z1.h, z1.h, z1.h +; SVE-NEXT: uunpklo z2.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v16i16_v16i32: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.h, z0.h, z0.h +; SVE2-NEXT: add z0.h, z1.h, z1.h +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z3.s, z4.h +; SVE2-NEXT: uunpklo z1.s, z5.h +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: @@ -2571,15 +3118,25 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { -; CHECK-LABEL: zext_v4i16_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v4i16_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v4i16_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: uunpklo 
z0.s, z0.h +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.d, z2.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: @@ -2604,21 +3161,37 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { } define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { -; CHECK-LABEL: zext_v8i16_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v8i16_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z2.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q2, q1, [x0] +; SVE-NEXT: stp q3, q0, [x0, #32] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v8i16_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.s, z0.h +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z1.d, z4.s +; SVE2-NEXT: uunpklo z3.d, z5.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: stp q2, q3, [x0, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: @@ -2654,34 +3227,63 @@ define 
void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { } define void @zext_v16i16_v16i64(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v16i16_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v16i16_v16i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.h, z0.h, z0.h +; SVE-NEXT: add z1.h, z1.h, z1.h +; SVE-NEXT: uunpklo z2.s, z0.h +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.s, z1.h +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpklo z4.d, z2.s +; SVE-NEXT: uunpklo z5.d, z3.s +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; SVE-NEXT: ext z3.b, z3.b, z0.b, #8 +; SVE-NEXT: uunpklo z6.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z7.d, z1.s +; SVE-NEXT: uunpklo z2.d, z2.s +; SVE-NEXT: uunpklo z3.d, z3.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: stp q5, q3, [x1] +; SVE-NEXT: stp q4, q2, [x1, #64] +; SVE-NEXT: stp q6, q0, [x1, #96] +; SVE-NEXT: stp q7, q1, [x1, #32] +; SVE-NEXT: ret +; +; 
SVE2-LABEL: zext_v16i16_v16i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.h, z0.h, z0.h +; SVE2-NEXT: add z0.h, z1.h, z1.h +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z2.s, z2.h +; SVE2-NEXT: uunpklo z5.s, z0.h +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z16.s, z4.h +; SVE2-NEXT: ext z1.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: uunpklo z18.s, z0.h +; SVE2-NEXT: ext z0.b, { z5.b, z6.b }, #8 +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z3.d, z5.s +; SVE2-NEXT: uunpklo z1.d, z1.s +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: ext z4.b, { z16.b, z17.b }, #8 +; SVE2-NEXT: ext z5.b, { z18.b, z19.b }, #8 +; SVE2-NEXT: uunpklo z6.d, z16.s +; SVE2-NEXT: stp q3, q0, [x1] +; SVE2-NEXT: uunpklo z3.d, z18.s +; SVE2-NEXT: stp q2, q1, [x1, #64] +; SVE2-NEXT: uunpklo z2.d, z4.s +; SVE2-NEXT: uunpklo z1.d, z5.s +; SVE2-NEXT: stp q3, q1, [x1, #32] +; SVE2-NEXT: stp q6, q2, [x1, #96] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: @@ -2777,14 +3379,23 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { -; CHECK-LABEL: zext_v4i32_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v4i32_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: uunpklo z1.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: stp q1, q0, [x0] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v4i32_v4i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0_z1 +; SVE2-NEXT: ext z2.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z1.d, z2.s +; SVE2-NEXT: stp q0, q1, [x0] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: ; 
NONEON-NOSVE: // %bb.0: @@ -2808,20 +3419,35 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { } define void @zext_v8i32_v8i64(ptr %in, ptr %out) { -; CHECK-LABEL: zext_v8i32_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: add z0.s, z0.s, z0.s -; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: ret +; SVE-LABEL: zext_v8i32_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp q1, q0, [x0] +; SVE-NEXT: add z0.s, z0.s, z0.s +; SVE-NEXT: add z1.s, z1.s, z1.s +; SVE-NEXT: uunpklo z2.d, z0.s +; SVE-NEXT: ext z0.b, z0.b, z0.b, #8 +; SVE-NEXT: uunpklo z3.d, z1.s +; SVE-NEXT: ext z1.b, z1.b, z0.b, #8 +; SVE-NEXT: uunpklo z0.d, z0.s +; SVE-NEXT: uunpklo z1.d, z1.s +; SVE-NEXT: stp q2, q0, [x1, #32] +; SVE-NEXT: stp q3, q1, [x1] +; SVE-NEXT: ret +; +; SVE2-LABEL: zext_v8i32_v8i64: +; SVE2: // %bb.0: +; SVE2-NEXT: ldp q1, q0, [x0] +; SVE2-NEXT: add z2.s, z0.s, z0.s +; SVE2-NEXT: add z0.s, z1.s, z1.s +; SVE2-NEXT: ext z4.b, { z2.b, z3.b }, #8 +; SVE2-NEXT: ext z5.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: uunpklo z2.d, z2.s +; SVE2-NEXT: uunpklo z0.d, z0.s +; SVE2-NEXT: uunpklo z3.d, z4.s +; SVE2-NEXT: uunpklo z1.d, z5.s +; SVE2-NEXT: stp q0, q1, [x1] +; SVE2-NEXT: stp q2, q3, [x1, #32] +; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index bffef13..d880eba 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -64,18 +64,18 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; 
CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z2.h, z1.b -; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sunpklo z4.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: sunpklo z6.s, z2.h +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z3.b, { z4.b, z5.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z4.h ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } ; CHECK-NEXT: ptrue p0.b, vl8 @@ -139,46 +139,44 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-LABEL: srem_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z2.h, z1.b -; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: sunpklo z5.h, z1.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: sunpklo z5.h, z5.b -; CHECK-NEXT: sunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: sunpklo z16.h, 
z3.b +; CHECK-NEXT: sunpklo z0.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z16.h ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: ext z7.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: ext z6.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: sunpklo z16.h, z6.b +; CHECK-NEXT: sunpklo z6.h, z7.b +; CHECK-NEXT: sunpklo z18.s, z16.h +; CHECK-NEXT: sunpklo z19.s, z6.h +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ext z6.b, { z6.b, z7.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: uzp1 z16.h, z0.h, z0.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: uzp1 z17.h, z5.h, z5.h +; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } -; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.h, z3.h, z3.h -; CHECK-NEXT: splice z3.h, p0, { z6.h, z7.h } +; CHECK-NEXT: splice z0.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z19.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z6.b, z0.b, z0.b +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b -; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } +; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: splice z0.b, p0, { z6.b, z7.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: msb z0.b, p0/m, z1.b, z3.b ; CHECK-NEXT: // 
kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -277,84 +275,80 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: sunpklo z4.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z3.h -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.h, z3.b -; CHECK-NEXT: sunpklo z16.h, z4.b -; CHECK-NEXT: sunpklo z3.s, z7.h -; CHECK-NEXT: sunpklo z4.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: movprfx z6, z4 -; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] -; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: sunpklo z17.h, z4.b -; CHECK-NEXT: sunpklo z18.h, z3.b -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: sunpklo z19.s, z17.h -; CHECK-NEXT: sunpklo z20.s, z18.h -; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: sunpklo z17.s, z17.h -; CHECK-NEXT: sunpklo z18.s, z18.h -; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 -; CHECK-NEXT: sunpklo z20.h, z20.b -; CHECK-NEXT: sunpklo z22.s, z20.h -; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 -; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: sunpklo z5.h, z2.b +; CHECK-NEXT: sunpklo z16.h, z0.b +; CHECK-NEXT: 
sunpklo z4.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: ldr q16, [x1] +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z23.h, z16.b +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z7.s +; CHECK-NEXT: movprfx z7, z6 +; CHECK-NEXT: sdiv z7.s, p0/m, z7.s, z5.s +; CHECK-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z6.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: sunpklo z27.s, z23.h +; CHECK-NEXT: ext z23.b, { z23.b, z24.b }, #8 +; CHECK-NEXT: sunpklo z19.h, z5.b +; CHECK-NEXT: sunpklo z21.h, z6.b +; CHECK-NEXT: sunpklo z23.s, z23.h +; CHECK-NEXT: sunpklo z5.s, z19.h +; CHECK-NEXT: sunpklo z6.s, z21.h +; CHECK-NEXT: ext z19.b, { z19.b, z20.b }, #8 +; CHECK-NEXT: ext z20.b, { z21.b, z22.b }, #8 +; CHECK-NEXT: sunpklo z19.s, z19.h +; CHECK-NEXT: movprfx z18, z6 +; CHECK-NEXT: sdiv z18.s, p0/m, z18.s, z5.s +; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: sunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: sunpklo z18.h, z18.b -; CHECK-NEXT: sunpklo z21.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: sunpklo z18.s, z18.h -; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s -; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h +; CHECK-NEXT: sunpklo z25.h, z5.b +; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: uzp1 z20.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h +; CHECK-NEXT: ext z24.b, { z25.b, z26.b }, #8 +; CHECK-NEXT: sunpklo z28.s, z25.h +; CHECK-NEXT: ext z25.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: sunpklo z24.s, z24.h +; CHECK-NEXT: sdivr z23.s, p0/m, z23.s, z24.s +; CHECK-NEXT: ext z24.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sdivr z27.s, p0/m, z27.s, z28.s +; CHECK-NEXT: sunpklo z28.h, z24.b +; CHECK-NEXT: 
sunpklo z24.h, z25.b +; CHECK-NEXT: sunpklo z26.s, z28.h +; CHECK-NEXT: sunpklo z30.s, z24.h +; CHECK-NEXT: ext z28.b, { z28.b, z29.b }, #8 +; CHECK-NEXT: ext z24.b, { z24.b, z25.b }, #8 +; CHECK-NEXT: sunpklo z25.s, z28.h +; CHECK-NEXT: sunpklo z24.s, z24.h +; CHECK-NEXT: sdivr z26.s, p0/m, z26.s, z30.s +; CHECK-NEXT: uzp1 z27.h, z27.h, z27.h +; CHECK-NEXT: uzp1 z28.h, z23.h, z23.h +; CHECK-NEXT: sdiv z24.s, p0/m, z24.s, z25.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h -; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } -; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h -; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b +; CHECK-NEXT: splice z4.h, p0, { z27.h, z28.h } +; CHECK-NEXT: splice z7.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z22.h, z26.h, z26.h +; CHECK-NEXT: uzp1 z20.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z23.h, z24.h, z24.h +; CHECK-NEXT: uzp1 z24.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z25.h, z19.h, z19.h +; CHECK-NEXT: splice z18.h, p0, { z22.h, z23.h } +; CHECK-NEXT: uzp1 z22.b, z7.b, z7.b +; CHECK-NEXT: splice z19.h, p0, { z24.h, z25.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } -; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b -; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } +; CHECK-NEXT: uzp1 z21.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z23.b, z19.b, z19.b +; CHECK-NEXT: splice z4.b, p0, { z20.b, z21.b } +; CHECK-NEXT: splice z7.b, p0, { z22.b, z23.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: msb z4.b, p0/m, z16.b, z5.b +; CHECK-NEXT: mls z0.b, p0/m, z7.b, z2.b +; CHECK-NEXT: stp q4, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: srem_v32i8: @@ -586,25 
+580,23 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-LABEL: srem_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: sunpklo z0.s, z1.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z6.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: ext z5.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z6.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z7.h, z5.h, z5.h +; CHECK-NEXT: splice z0.h, p0, { z6.h, z7.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: msb z0.h, p0/m, z1.h, z3.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -662,41 +654,37 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q16, q2, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: sdivr z2.s, 
p0/m, z2.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: mov z7.d, z3.d -; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: mov z6.d, z4.d -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: ext z20.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: ldr q5, [x0] +; CHECK-NEXT: sunpklo z20.s, z20.h +; CHECK-NEXT: sunpklo z18.s, z5.h +; CHECK-NEXT: ext z19.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: sunpklo z19.s, z19.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: ext z18.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: uzp1 z22.h, z4.h, z4.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: ext z19.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: sunpklo z19.s, z19.h +; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: uzp1 z20.h, z7.h, z7.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } +; CHECK-NEXT: uzp1 z21.h, z18.h, z18.h +; CHECK-NEXT: splice z4.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z23.h, z19.h, z19.h +; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: msb z4.h, p0/m, z16.h, z5.h +; 
CHECK-NEXT: mls z0.h, p0/m, z7.h, z2.h +; CHECK-NEXT: stp q4, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: srem_v16i16: @@ -1114,18 +1102,18 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z2.h, z1.b -; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: uunpklo z4.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: uunpklo z6.s, z2.h +; CHECK-NEXT: ext z2.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z3.b, { z4.b, z5.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z4.h ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } ; CHECK-NEXT: ptrue p0.b, vl8 @@ -1189,46 +1177,44 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-LABEL: urem_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.h, z1.b -; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: uunpklo z5.h, z1.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; 
CHECK-NEXT: uunpklo z5.h, z5.b -; CHECK-NEXT: uunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: uunpklo z16.h, z3.b +; CHECK-NEXT: uunpklo z0.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z16.h ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: ext z7.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: ext z6.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: uunpklo z16.h, z6.b +; CHECK-NEXT: uunpklo z6.h, z7.b +; CHECK-NEXT: uunpklo z18.s, z16.h +; CHECK-NEXT: uunpklo z19.s, z6.h +; CHECK-NEXT: ext z16.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: ext z6.b, { z6.b, z7.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z16.h +; CHECK-NEXT: uzp1 z16.h, z0.h, z0.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: uzp1 z17.h, z5.h, z5.h +; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } -; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.h, z3.h, z3.h -; CHECK-NEXT: splice z3.h, p0, { z6.h, z7.h } +; CHECK-NEXT: splice z0.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z19.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z6.b, z0.b, z0.b +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b -; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } +; 
CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: splice z0.b, p0, { z6.b, z7.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: msb z0.b, p0/m, z1.b, z3.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1327,84 +1313,80 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: uunpklo z4.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z3.h -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.h, z3.b -; CHECK-NEXT: uunpklo z16.h, z4.b -; CHECK-NEXT: uunpklo z3.s, z7.h -; CHECK-NEXT: uunpklo z4.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: movprfx z6, z4 -; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] -; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: uunpklo z17.h, z4.b -; CHECK-NEXT: uunpklo z18.h, z3.b -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: uunpklo z19.s, z17.h -; CHECK-NEXT: uunpklo z20.s, z18.h -; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: uunpklo z17.s, z17.h -; CHECK-NEXT: uunpklo z18.s, z18.h -; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 -; CHECK-NEXT: uunpklo z20.h, z20.b -; CHECK-NEXT: uunpklo z22.s, 
z20.h -; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 -; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: uunpklo z5.h, z2.b +; CHECK-NEXT: uunpklo z16.h, z0.b +; CHECK-NEXT: uunpklo z4.s, z5.h +; CHECK-NEXT: ext z5.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: ext z6.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: uunpklo z7.s, z16.h +; CHECK-NEXT: ldr q16, [x1] +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z23.h, z16.b +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z7.s +; CHECK-NEXT: movprfx z7, z6 +; CHECK-NEXT: udiv z7.s, p0/m, z7.s, z5.s +; CHECK-NEXT: ext z5.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: ext z6.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: uunpklo z27.s, z23.h +; CHECK-NEXT: ext z23.b, { z23.b, z24.b }, #8 +; CHECK-NEXT: uunpklo z19.h, z5.b +; CHECK-NEXT: uunpklo z21.h, z6.b +; CHECK-NEXT: uunpklo z23.s, z23.h +; CHECK-NEXT: uunpklo z5.s, z19.h +; CHECK-NEXT: uunpklo z6.s, z21.h +; CHECK-NEXT: ext z19.b, { z19.b, z20.b }, #8 +; CHECK-NEXT: ext z20.b, { z21.b, z22.b }, #8 +; CHECK-NEXT: uunpklo z19.s, z19.h +; CHECK-NEXT: movprfx z18, z6 +; CHECK-NEXT: udiv z18.s, p0/m, z18.s, z5.s +; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: uunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: uunpklo z18.h, z18.b -; CHECK-NEXT: uunpklo z21.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 -; CHECK-NEXT: uunpklo z18.s, z18.h -; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s -; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h +; CHECK-NEXT: uunpklo z25.h, z5.b +; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: uzp1 z20.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h +; CHECK-NEXT: ext z24.b, { z25.b, z26.b }, #8 +; CHECK-NEXT: uunpklo z28.s, z25.h +; CHECK-NEXT: ext z25.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: 
uunpklo z24.s, z24.h +; CHECK-NEXT: udivr z23.s, p0/m, z23.s, z24.s +; CHECK-NEXT: ext z24.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: udivr z27.s, p0/m, z27.s, z28.s +; CHECK-NEXT: uunpklo z28.h, z24.b +; CHECK-NEXT: uunpklo z24.h, z25.b +; CHECK-NEXT: uunpklo z26.s, z28.h +; CHECK-NEXT: uunpklo z30.s, z24.h +; CHECK-NEXT: ext z28.b, { z28.b, z29.b }, #8 +; CHECK-NEXT: ext z24.b, { z24.b, z25.b }, #8 +; CHECK-NEXT: uunpklo z25.s, z28.h +; CHECK-NEXT: uunpklo z24.s, z24.h +; CHECK-NEXT: udivr z26.s, p0/m, z26.s, z30.s +; CHECK-NEXT: uzp1 z27.h, z27.h, z27.h +; CHECK-NEXT: uzp1 z28.h, z23.h, z23.h +; CHECK-NEXT: udiv z24.s, p0/m, z24.s, z25.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h -; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } -; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h -; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b +; CHECK-NEXT: splice z4.h, p0, { z27.h, z28.h } +; CHECK-NEXT: splice z7.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z22.h, z26.h, z26.h +; CHECK-NEXT: uzp1 z20.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z23.h, z24.h, z24.h +; CHECK-NEXT: uzp1 z24.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z25.h, z19.h, z19.h +; CHECK-NEXT: splice z18.h, p0, { z22.h, z23.h } +; CHECK-NEXT: uzp1 z22.b, z7.b, z7.b +; CHECK-NEXT: splice z19.h, p0, { z24.h, z25.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } -; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b -; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } +; CHECK-NEXT: uzp1 z21.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z23.b, z19.b, z19.b +; CHECK-NEXT: splice z4.b, p0, { z20.b, z21.b } +; CHECK-NEXT: splice z7.b, p0, { z22.b, z23.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b -; 
CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: msb z4.b, p0/m, z16.b, z5.b +; CHECK-NEXT: mls z0.b, p0/m, z7.b, z2.b +; CHECK-NEXT: stp q4, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: urem_v32i8: @@ -1636,25 +1618,23 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-LABEL: urem_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z6.b, { z3.b, z4.b }, #8 +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: ext z5.b, { z1.b, z2.b }, #8 +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z6.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z7.h, z5.h, z5.h +; CHECK-NEXT: splice z0.h, p0, { z6.h, z7.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: msb z0.h, p0/m, z1.h, z3.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1712,41 +1692,37 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q16, q2, [x1] ; 
CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: mov z7.d, z3.d -; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: mov z6.d, z4.d -; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: uunpklo z7.s, z16.h +; CHECK-NEXT: ext z20.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: ldr q5, [x0] +; CHECK-NEXT: uunpklo z20.s, z20.h +; CHECK-NEXT: uunpklo z18.s, z5.h +; CHECK-NEXT: ext z19.b, { z5.b, z6.b }, #8 +; CHECK-NEXT: uunpklo z19.s, z19.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: ext z18.b, { z16.b, z17.b }, #8 +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: uzp1 z22.h, z4.h, z4.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: ext z19.b, { z2.b, z3.b }, #8 +; CHECK-NEXT: uunpklo z19.s, z19.h +; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: uzp1 z20.h, z7.h, z7.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h -; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } +; CHECK-NEXT: uzp1 z21.h, z18.h, z18.h +; CHECK-NEXT: splice z4.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z23.h, z19.h, z19.h +; CHECK-NEXT: splice 
z7.h, p0, { z22.h, z23.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: msb z4.h, p0/m, z16.h, z5.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z2.h +; CHECK-NEXT: stp q4, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: urem_v16i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 3627390..bfa4bc2 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -97,17 +97,17 @@ entry: define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q0, [x0, #32] -; CHECK-NEXT: ldp q3, q4, [x0] -; CHECK-NEXT: add z2.s, z0.s, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: add z3.s, z3.s, z3.s -; CHECK-NEXT: add z4.s, z4.s, z4.s -; CHECK-NEXT: mov z0.s, s0 -; CHECK-NEXT: stp q1, q2, [x0, #32] -; CHECK-NEXT: stp q3, q4, [x0] +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ext z3.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: add z2.s, z2.s, z2.s +; CHECK-NEXT: add z1.s, z0.s, z0.s +; CHECK-NEXT: mov z0.s, s3 +; CHECK-NEXT: add z3.s, z4.s, z4.s +; CHECK-NEXT: add z4.s, z5.s, z5.s +; CHECK-NEXT: stp q2, q1, [x0, #32] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: stp q3, q4, [x0] ; CHECK-NEXT: ret ; ; 
NONEON-NOSVE-LABEL: test2: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll index 93d6da1..1caf89f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve < %s | FileCheck %s ; RUN: llc -mattr=+dotprod,+sve < %s | FileCheck %s -check-prefix=DOT -; RUN: llc -mattr=+dotprod,+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE +; RUN: llc -mattr=+dotprod,+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE ; RUN: llc -mattr=+dotprod,+sme -force-streaming < %s | FileCheck %s --check-prefix=STREAMING-SVE target triple = "aarch64-unknown-linux-gnu" @@ -36,34 +36,33 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) { ; ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8: ; STREAMING-SVE: // %bb.0: -; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b -; STREAMING-SVE-NEXT: uunpklo z3.h, z0.b +; STREAMING-SVE-NEXT: mov z3.d, z0.d +; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; STREAMING-SVE-NEXT: ext z0.b, { z1.b, z2.b }, #8 ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b -; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b -; STREAMING-SVE-NEXT: uunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: uunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: mov z5.d, z1.d -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 -; STREAMING-SVE-NEXT: uunpklo z7.s, z0.h +; STREAMING-SVE-NEXT: ext z5.b, { z3.b, z4.b }, #8 +; STREAMING-SVE-NEXT: uunpklo 
z6.h, z0.b +; STREAMING-SVE-NEXT: uunpklo z3.h, z3.b +; STREAMING-SVE-NEXT: ext z0.b, { z1.b, z2.b }, #8 ; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 -; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h -; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h +; STREAMING-SVE-NEXT: uunpklo z16.h, z5.b +; STREAMING-SVE-NEXT: ext z5.b, { z6.b, z7.b }, #8 +; STREAMING-SVE-NEXT: ext z19.b, { z3.b, z4.b }, #8 +; STREAMING-SVE-NEXT: uunpklo z2.s, z3.h ; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h -; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s -; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s -; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s +; STREAMING-SVE-NEXT: uunpklo z6.s, z6.h +; STREAMING-SVE-NEXT: ext z18.b, { z16.b, z17.b }, #8 +; STREAMING-SVE-NEXT: uunpklo z3.s, z5.h +; STREAMING-SVE-NEXT: uunpklo z5.s, z19.h +; STREAMING-SVE-NEXT: uunpklo z7.s, z16.h +; STREAMING-SVE-NEXT: add z1.s, z2.s, z1.s +; STREAMING-SVE-NEXT: uunpklo z4.s, z18.h +; STREAMING-SVE-NEXT: add z0.s, z5.s, z0.s +; STREAMING-SVE-NEXT: add z2.s, z7.s, z6.s +; STREAMING-SVE-NEXT: add z3.s, z4.s, z3.s +; STREAMING-SVE-NEXT: add z1.s, z1.s, z2.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z3.s ; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s ; STREAMING-SVE-NEXT: fmov w0, s0 @@ -103,34 +102,33 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) { ; ; STREAMING-SVE-LABEL: reduce_saddv_v16i8: ; STREAMING-SVE: // %bb.0: -; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b -; STREAMING-SVE-NEXT: sunpklo z3.h, z0.b +; STREAMING-SVE-NEXT: mov z3.d, z0.d +; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1_z2 +; STREAMING-SVE-NEXT: ext z0.b, { z1.b, 
z2.b }, #8 ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b -; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b -; STREAMING-SVE-NEXT: sunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: sunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: mov z5.d, z1.d -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 -; STREAMING-SVE-NEXT: sunpklo z7.s, z0.h +; STREAMING-SVE-NEXT: ext z5.b, { z3.b, z4.b }, #8 +; STREAMING-SVE-NEXT: sunpklo z6.h, z0.b +; STREAMING-SVE-NEXT: sunpklo z3.h, z3.b +; STREAMING-SVE-NEXT: ext z0.b, { z1.b, z2.b }, #8 ; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z0.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 -; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h -; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h +; STREAMING-SVE-NEXT: sunpklo z16.h, z5.b +; STREAMING-SVE-NEXT: ext z5.b, { z6.b, z7.b }, #8 +; STREAMING-SVE-NEXT: ext z19.b, { z3.b, z4.b }, #8 +; STREAMING-SVE-NEXT: sunpklo z2.s, z3.h ; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h -; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s -; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s -; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s +; STREAMING-SVE-NEXT: sunpklo z6.s, z6.h +; STREAMING-SVE-NEXT: ext z18.b, { z16.b, z17.b }, #8 +; STREAMING-SVE-NEXT: sunpklo z3.s, z5.h +; STREAMING-SVE-NEXT: sunpklo z5.s, z19.h +; STREAMING-SVE-NEXT: sunpklo z7.s, z16.h +; STREAMING-SVE-NEXT: add z1.s, z2.s, z1.s +; STREAMING-SVE-NEXT: sunpklo z4.s, z18.h +; STREAMING-SVE-NEXT: add z0.s, z5.s, z0.s +; STREAMING-SVE-NEXT: add z2.s, z7.s, z6.s +; STREAMING-SVE-NEXT: add z3.s, z4.s, z3.s +; STREAMING-SVE-NEXT: add z1.s, z1.s, z2.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z3.s ; STREAMING-SVE-NEXT: add z0.s, 
z1.s, z0.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s ; STREAMING-SVE-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splice.ll b/llvm/test/CodeGen/AArch64/sve-vector-splice.ll new file mode 100644 index 0000000..5d2a125 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-vector-splice.ll @@ -0,0 +1,253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=SVE2 + +target triple = "aarch64-unknown-linux-gnu" + +; Test vector_splice patterns. +; Note that this test is similar to named-vector-shuffles-sve.ll, but it focuses +; on testing all supported types, and a positive "splice index". + + +; i8 elements +define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { +; SVE-LABEL: splice_nxv16i8: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv16i8: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #1 +; SVE2-NEXT: ret + %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1) + ret <vscale x 16 x i8> %res +} + +; i16 elements +define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; SVE-LABEL: splice_nxv8i16: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #2 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv8i16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #2 +; SVE2-NEXT: ret + %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1) + ret 
<vscale x 8 x i16> %res +} + +; bf16 elements + +define <vscale x 8 x bfloat> @splice_nxv8bfloat(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) { +; SVE-LABEL: splice_nxv8bfloat: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #2 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv8bfloat: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #2 +; SVE2-NEXT: ret + %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bfloat(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 1) + ret <vscale x 8 x bfloat> %res +} + +define <vscale x 4 x bfloat> @splice_nxv4bfloat(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) { +; SVE-LABEL: splice_nxv4bfloat: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #4 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv4bfloat: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #4 +; SVE2-NEXT: ret + %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bfloat(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 1) + ret <vscale x 4 x bfloat> %res +} + +define <vscale x 2 x bfloat> @splice_nxv2bfloat(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) { +; SVE-LABEL: splice_nxv2bfloat: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #8 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv2bfloat: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: ret + %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv4bfloat(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 1) + ret <vscale x 2 x bfloat> %res +} + +; f16 elements + +define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x 
half> %a, <vscale x 8 x half> %b) { +; SVE-LABEL: splice_nxv8f16: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #2 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv8f16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #2 +; SVE2-NEXT: ret + %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1) + ret <vscale x 8 x half> %res +} + +define <vscale x 4 x half> @splice_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) { +; SVE-LABEL: splice_nxv4f16: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #4 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv4f16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #4 +; SVE2-NEXT: ret + %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1) + ret <vscale x 4 x half> %res +} + +define <vscale x 2 x half> @splice_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) { +; SVE-LABEL: splice_nxv2f16: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #8 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv2f16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: ret + %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1) + ret <vscale x 2 x half> %res +} + +; i32 elements +define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; SVE-LABEL: splice_nxv4i32: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #4 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv4i32: +; SVE2: // %bb.0: 
+; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #4 +; SVE2-NEXT: ret + %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1) + ret <vscale x 4 x i32> %res +} + +; f32 elements + +define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) { +; SVE-LABEL: splice_nxv4f32: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #4 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv4f32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #4 +; SVE2-NEXT: ret + %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1) + ret <vscale x 4 x float> %res +} + +define <vscale x 2 x float> @splice_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) { +; SVE-LABEL: splice_nxv2f32: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #8 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv2f32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: ret + %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1) + ret <vscale x 2 x float> %res +} + +; i64 elements +define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) { +; SVE-LABEL: splice_nxv2i64: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #8 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv2i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { 
z0.b, z1.b }, #8 +; SVE2-NEXT: ret + %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1) + ret <vscale x 2 x i64> %res +} + +; f64 elements +define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) { +; SVE-LABEL: splice_nxv2f64: +; SVE: // %bb.0: +; SVE-NEXT: ext z0.b, z0.b, z1.b, #8 +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_nxv2f64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: ext z0.b, { z0.b, z1.b }, #8 +; SVE2-NEXT: ret + %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1) + ret <vscale x 2 x double> %res +} + +declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32) +declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32) +declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32) +declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32) + +declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32) +declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32) +declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) +declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32) +declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32) + +declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32) +declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale 
x 4 x half>, i32) +declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32) +declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32) +declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32) +declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32) +declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32) + +declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32) +declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32) +declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) diff --git a/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll index b96fad8..6fd3aff 100644 --- a/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll @@ -52,9 +52,8 @@ define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2 ; CHECK-LABEL: extract_v4i64_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -70,9 +69,8 @@ define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) vscale_range( ; CHECK-LABEL: extract_v4double_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; 
CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -88,9 +86,8 @@ define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2 ; CHECK-LABEL: extract_v8i32_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -110,9 +107,8 @@ define void @extract_v8i32_halves_intrinsic(ptr %in, ptr %out, ptr %out2) vscale ; CHECK-LABEL: extract_v8i32_halves_intrinsic: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -128,9 +124,8 @@ define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2 ; CHECK-LABEL: extract_v8float_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -146,9 +141,8 @@ define void @extract_v8i32_half_unaligned(<8 x i32> %unused, ptr %in, ptr %out) ; CHECK-LABEL: extract_v8i32_half_unaligned: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8 ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret entry: @@ -162,15 +156,13 @@ define void @extract_v8i32_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr ; CHECK-LABEL: extract_v8i32_quarters: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, 
z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #24 -; CHECK-NEXT: str d1, [x1] -; CHECK-NEXT: str d2, [x2] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: ext z3.b, { z0.b, z1.b }, #24 +; CHECK-NEXT: ext z4.b, { z0.b, z1.b }, #8 +; CHECK-NEXT: str d2, [x1] +; CHECK-NEXT: str d3, [x2] ; CHECK-NEXT: str d0, [x3] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: str d4, [x4] ; CHECK-NEXT: ret entry: %b = load <8 x i32>, ptr %in @@ -189,9 +181,8 @@ define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2, ; CHECK-LABEL: extract_v16i16_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -223,9 +214,8 @@ define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2 ; CHECK-LABEL: extract_v16half_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -241,9 +231,8 @@ define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2 ; CHECK-LABEL: extract_v32i8_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #16 +; CHECK-NEXT: str q2, [x1] ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: ret entry: @@ -264,9 +253,8 @@ define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4 ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: 
mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 -; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #32 +; CHECK-NEXT: st1d { z2.d }, p0, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret entry: @@ -283,9 +271,8 @@ define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 -; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #32 +; CHECK-NEXT: st1w { z2.s }, p0, [x1] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret entry: @@ -302,9 +289,8 @@ define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 -; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #32 +; CHECK-NEXT: st1h { z2.h }, p0, [x1] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret entry: @@ -322,9 +308,8 @@ define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4 ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 -; CHECK-NEXT: st1b { z1.b }, p0, [x1] +; CHECK-NEXT: ext z2.b, { z0.b, z1.b }, #32 +; CHECK-NEXT: st1b { z2.b }, p0, [x1] ; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret entry: |