diff options
Diffstat (limited to 'llvm/test/CodeGen/AArch64/arm64-vmul.ll')
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-vmul.ll | 1358 |
1 file changed, 882 insertions, 476 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 937a17c..07400bb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1,12 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+aes | FileCheck %s +; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for pmull8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict +; CHECK-GI-NEXT: 
warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_low +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_high +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_low +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_high +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_64 +; CHECK-GI-NEXT: warning: Instruction 
selection used fallback path for test_pmull_high_64 define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull8h: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: smull.8h v0, v0, v1 +; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B @@ -19,7 +57,7 @@ define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: smull.4s v0, v0, v1 +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -32,7 +70,7 @@ define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: smull.2d v0, v0, v1 +; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -49,7 +87,7 @@ define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umull.8h v0, v0, v1 +; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B @@ -62,7 +100,7 @@ define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umull.4s v0, v0, v1 +; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -75,7 +113,7 @@ define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umull.2d v0, v0, v1 +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -92,7 +130,7 @@ define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, 
[x1] -; CHECK-NEXT: sqdmull.4s v0, v0, v1 +; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -105,7 +143,7 @@ define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.2d v0, v0, v1 +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -114,12 +152,19 @@ define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind { } define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind { -; CHECK-LABEL: sqdmull2_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1, #8] -; CHECK-NEXT: sqdmull.4s v0, v0, v1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmull2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr d0, [x0, #8] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmull2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %load1 = load <8 x i16>, ptr %A %load2 = load <8 x i16>, ptr %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -129,12 +174,19 @@ define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind { -; CHECK-LABEL: sqdmull2_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1, #8] -; CHECK-NEXT: sqdmull.2d v0, v0, v1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmull2_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr d0, [x0, #8] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: sqdmull v0.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmull2_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s +; 
CHECK-GI-NEXT: ret %load1 = load <4 x i32>, ptr %A %load2 = load <4 x i32>, ptr %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> @@ -152,7 +204,7 @@ define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B @@ -167,7 +219,7 @@ define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmulh.4h v0, v0, v1 +; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -180,7 +232,7 @@ define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqdmulh.8h v0, v0, v1 +; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B @@ -193,7 +245,7 @@ define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmulh.2s v0, v0, v1 +; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -206,7 +258,7 @@ define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqdmulh.4s v0, v0, v1 +; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B @@ -241,7 +293,7 @@ define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqrdmulh.4h v0, v0, v1 +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x 
i16>, ptr %B @@ -254,7 +306,7 @@ define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqrdmulh.8h v0, v0, v1 +; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B @@ -267,7 +319,7 @@ define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqrdmulh.2s v0, v0, v1 +; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -280,7 +332,7 @@ define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqrdmulh.4s v0, v0, v1 +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B @@ -289,15 +341,23 @@ define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind { } define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind { -; CHECK-LABEL: sqrdmulh_1s: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: sqrdmulh s0, s0, s1 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqrdmulh_1s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr w8, [x0] +; CHECK-SD-NEXT: ldr w9, [x1] +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: fmov s1, w9 +; CHECK-SD-NEXT: sqrdmulh s0, s0, s1 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqrdmulh_1s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x1] +; CHECK-GI-NEXT: sqrdmulh s0, s0, s1 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %tmp1 = load i32, ptr %A %tmp2 = load i32, ptr %B %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2) @@ -315,7 +375,7 @@ define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind { ; 
CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: fmulx.2s v0, v0, v1 +; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B @@ -328,7 +388,7 @@ define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmulx.4s v0, v0, v1 +; CHECK-NEXT: fmulx v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B @@ -341,7 +401,7 @@ define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmulx.2d v0, v0, v1 +; CHECK-NEXT: fmulx v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %tmp1 = load <2 x double>, ptr %A %tmp2 = load <2 x double>, ptr %B @@ -359,7 +419,7 @@ define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.4s v0, v1, v2 +; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -375,7 +435,7 @@ define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.2d v0, v1, v2 +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -386,14 +446,24 @@ define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind { } define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -; CHECK-LABEL: smlal8h_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: movi.16b v3, #1 -; CHECK-NEXT: smlal.8h v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: smlal.8h v3, v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smlal8h_chain_with_constant: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: 
movi v3.16b, #1 +; CHECK-SD-NEXT: smlal v3.8h, v0.8b, v2.8b +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: smlal v3.8h, v1.8b, v0.8b +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smlal8h_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvn v3.8b, v2.8b +; CHECK-GI-NEXT: smull v1.8h, v1.8b, v3.8b +; CHECK-GI-NEXT: movi v3.16b, #1 +; CHECK-GI-NEXT: smlal v1.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: add v0.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3) %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257> @@ -404,15 +474,26 @@ define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, < } define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: smlal2d_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 // =0x101 -; CHECK-NEXT: dup.2d v3, x8 -; CHECK-NEXT: smlal.2d v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: smlal.2d v3, v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smlal2d_chain_with_constant: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #257 // =0x101 +; CHECK-SD-NEXT: dup v3.2d, x8 +; CHECK-SD-NEXT: smlal v3.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: smlal v3.2d, v1.2s, v0.2s +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smlal2d_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvn v3.8b, v2.8b +; CHECK-GI-NEXT: adrp x8, .LCPI27_0 +; CHECK-GI-NEXT: smull v1.2d, v1.2s, v3.2s +; CHECK-GI-NEXT: smlal v1.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] +; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %xor = xor <2 x i32> %v3, 
<i32 -1, i32 -1> %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3) %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257> @@ -428,7 +509,7 @@ define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.4s v0, v1, v2 +; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -444,7 +525,7 @@ define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.2d v0, v1, v2 +; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -457,10 +538,10 @@ define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind { define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { ; CHECK-LABEL: smlsl8h_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.16b v3, #1 -; CHECK-NEXT: smlsl.8h v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: smlsl.8h v3, v1, v0 +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: smlsl v3.8h, v0.8b, v2.8b +; CHECK-NEXT: mvn v0.8b, v2.8b +; CHECK-NEXT: smlsl v3.8h, v1.8b, v0.8b ; CHECK-NEXT: str q3, [x0] ; CHECK-NEXT: ret %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> @@ -473,15 +554,25 @@ define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, < } define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: smlsl2d_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 // =0x101 -; CHECK-NEXT: dup.2d v3, x8 -; CHECK-NEXT: smlsl.2d v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: smlsl.2d v3, v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smlsl2d_chain_with_constant: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: mov w8, #257 // =0x101 +; CHECK-SD-NEXT: dup v3.2d, x8 +; CHECK-SD-NEXT: smlsl v3.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: smlsl v3.2d, v1.2s, v0.2s +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smlsl2d_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI31_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI31_0] +; CHECK-GI-NEXT: smlsl v3.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: mvn v0.8b, v2.8b +; CHECK-GI-NEXT: smlsl v3.2d, v1.2s, v0.2s +; CHECK-GI-NEXT: str q3, [x0] +; CHECK-GI-NEXT: ret %xor = xor <2 x i32> %v3, <i32 -1, i32 -1> %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3) %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1 @@ -502,7 +593,7 @@ define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.4s v0, v1, v2 +; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -518,7 +609,7 @@ define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2 +; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -529,13 +620,21 @@ define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind { -; CHECK-LABEL: sqdmlal2_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1, #8] -; CHECK-NEXT: sqdmlal.4s v0, v1, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x2] +; CHECK-SD-NEXT: ldr d1, [x0, #8] +; CHECK-SD-NEXT: ldr d2, [x1, #8] +; CHECK-SD-NEXT: sqdmlal v0.4s, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: sqdmlal2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: ldr q0, [x2] +; CHECK-GI-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h +; CHECK-GI-NEXT: ret %load1 = load <8 x i16>, ptr %A %load2 = load <8 x i16>, ptr %B %tmp3 = load <4 x i32>, ptr %C @@ -547,13 +646,21 @@ define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind { -; CHECK-LABEL: sqdmlal2_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1, #8] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x2] +; CHECK-SD-NEXT: ldr d1, [x0, #8] +; CHECK-SD-NEXT: ldr d2, [x1, #8] +; CHECK-SD-NEXT: sqdmlal v0.2d, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal2_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: ldr q0, [x2] +; CHECK-GI-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s +; CHECK-GI-NEXT: ret %load1 = load <4 x i32>, ptr %A %load2 = load <4 x i32>, ptr %B %tmp3 = load <2 x i64>, ptr %C @@ -570,7 +677,7 @@ define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 +; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -586,7 +693,7 @@ define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 +; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -597,13 +704,21 @@ define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind { 
-; CHECK-LABEL: sqdmlsl2_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1, #8] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x2] +; CHECK-SD-NEXT: ldr d1, [x0, #8] +; CHECK-SD-NEXT: ldr d2, [x1, #8] +; CHECK-SD-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: ldr q0, [x2] +; CHECK-GI-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h +; CHECK-GI-NEXT: ret %load1 = load <8 x i16>, ptr %A %load2 = load <8 x i16>, ptr %B %tmp3 = load <4 x i32>, ptr %C @@ -615,13 +730,21 @@ define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind { -; CHECK-LABEL: sqdmlsl2_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1, #8] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x2] +; CHECK-SD-NEXT: ldr d1, [x0, #8] +; CHECK-SD-NEXT: ldr d2, [x1, #8] +; CHECK-SD-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: ldr q0, [x2] +; CHECK-GI-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s +; CHECK-GI-NEXT: ret %load1 = load <4 x i32>, ptr %A %load2 = load <4 x i32>, ptr %B %tmp3 = load <2 x i64>, ptr %C @@ -638,7 +761,7 @@ define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.4s v0, v1, v2 +; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -654,7 +777,7 @@ define <2 x i64> @umlal2d(ptr %A, 
ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.2d v0, v1, v2 +; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -665,14 +788,24 @@ define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind { } define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -; CHECK-LABEL: umlal8h_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: movi.16b v3, #1 -; CHECK-NEXT: umlal.8h v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: umlal.8h v3, v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umlal8h_chain_with_constant: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v3.16b, #1 +; CHECK-SD-NEXT: umlal v3.8h, v0.8b, v2.8b +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: umlal v3.8h, v1.8b, v0.8b +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umlal8h_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvn v3.8b, v2.8b +; CHECK-GI-NEXT: umull v1.8h, v1.8b, v3.8b +; CHECK-GI-NEXT: movi v3.16b, #1 +; CHECK-GI-NEXT: umlal v1.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: add v0.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3) %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257> @@ -683,15 +816,26 @@ define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, < } define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: umlal2d_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 // =0x101 -; CHECK-NEXT: dup.2d v3, x8 -; CHECK-NEXT: umlal.2d v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: umlal.2d v3, 
v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umlal2d_chain_with_constant: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #257 // =0x101 +; CHECK-SD-NEXT: dup v3.2d, x8 +; CHECK-SD-NEXT: umlal v3.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: umlal v3.2d, v1.2s, v0.2s +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umlal2d_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvn v3.8b, v2.8b +; CHECK-GI-NEXT: adrp x8, .LCPI43_0 +; CHECK-GI-NEXT: umull v1.2d, v1.2s, v3.2s +; CHECK-GI-NEXT: umlal v1.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI43_0] +; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %xor = xor <2 x i32> %v3, <i32 -1, i32 -1> %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3) %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257> @@ -707,7 +851,7 @@ define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.4s v0, v1, v2 +; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B @@ -723,7 +867,7 @@ define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.2d v0, v1, v2 +; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B @@ -736,10 +880,10 @@ define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind { define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { ; CHECK-LABEL: umlsl8h_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.16b v3, #1 -; CHECK-NEXT: umlsl.8h v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: umlsl.8h v3, v1, v0 +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: umlsl 
v3.8h, v0.8b, v2.8b +; CHECK-NEXT: mvn v0.8b, v2.8b +; CHECK-NEXT: umlsl v3.8h, v1.8b, v0.8b ; CHECK-NEXT: str q3, [x0] ; CHECK-NEXT: ret %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> @@ -752,15 +896,25 @@ define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, < } define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: umlsl2d_chain_with_constant: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 // =0x101 -; CHECK-NEXT: dup.2d v3, x8 -; CHECK-NEXT: umlsl.2d v3, v0, v2 -; CHECK-NEXT: mvn.8b v0, v2 -; CHECK-NEXT: umlsl.2d v3, v1, v0 -; CHECK-NEXT: str q3, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umlsl2d_chain_with_constant: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #257 // =0x101 +; CHECK-SD-NEXT: dup v3.2d, x8 +; CHECK-SD-NEXT: umlsl v3.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: mvn v0.8b, v2.8b +; CHECK-SD-NEXT: umlsl v3.2d, v1.2s, v0.2s +; CHECK-SD-NEXT: str q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umlsl2d_chain_with_constant: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI47_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI47_0] +; CHECK-GI-NEXT: umlsl v3.2d, v0.2s, v2.2s +; CHECK-GI-NEXT: mvn v0.8b, v2.8b +; CHECK-GI-NEXT: umlsl v3.2d, v1.2s, v0.2s +; CHECK-GI-NEXT: str q3, [x0] +; CHECK-GI-NEXT: ret %xor = xor <2 x i32> %v3, <i32 -1, i32 -1> %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3) %add.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1 @@ -776,7 +930,7 @@ define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmla.2s v0, v2, v1 +; CHECK-NEXT: fmla v0.2s, v2.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B @@ -791,7 +945,7 @@ define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, 
[x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmla.4s v0, v2, v1 +; CHECK-NEXT: fmla v0.4s, v2.4s, v1.4s ; CHECK-NEXT: ret %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B @@ -806,7 +960,7 @@ define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmla.2d v0, v2, v1 +; CHECK-NEXT: fmla v0.2d, v2.2d, v1.2d ; CHECK-NEXT: ret %tmp1 = load <2 x double>, ptr %A %tmp2 = load <2 x double>, ptr %B @@ -825,7 +979,7 @@ define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmls.2s v0, v1, v2 +; CHECK-NEXT: fmls v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B @@ -841,7 +995,7 @@ define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.4s v0, v1, v2 +; CHECK-NEXT: fmls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B @@ -857,7 +1011,7 @@ define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.2d v0, v1, v2 +; CHECK-NEXT: fmls v0.2d, v1.2d, v2.2d ; CHECK-NEXT: ret %tmp1 = load <2 x double>, ptr %A %tmp2 = load <2 x double>, ptr %B @@ -873,7 +1027,7 @@ define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: ldr d2, [x1] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmls.2s v0, v1, v2 +; CHECK-NEXT: fmls v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B @@ -889,7 +1043,7 @@ define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q0, [x2] 
-; CHECK-NEXT: fmls.4s v0, v1, v2 +; CHECK-NEXT: fmls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B @@ -905,7 +1059,7 @@ define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.2d v0, v1, v2 +; CHECK-NEXT: fmls v0.2d, v1.2d, v2.2d ; CHECK-NEXT: ret %tmp1 = load <2 x double>, ptr %A %tmp2 = load <2 x double>, ptr %B @@ -919,7 +1073,7 @@ define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> ; CHECK-LABEL: fmls_indexed_2s: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fmls.2s v0, v2, v1[0] +; CHECK-NEXT: fmls v0.2s, v2.2s, v1.s[0] ; CHECK-NEXT: ret entry: %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c @@ -931,7 +1085,7 @@ entry: define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp { ; CHECK-LABEL: fmls_indexed_4s: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmls.4s v0, v2, v1[0] +; CHECK-NEXT: fmls v0.4s, v2.4s, v1.s[0] ; CHECK-NEXT: ret entry: %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c @@ -943,7 +1097,7 @@ entry: define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp { ; CHECK-LABEL: fmls_indexed_2d: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmls.2d v0, v2, v1[0] +; CHECK-NEXT: fmls v0.2d, v2.2d, v1.d[0] ; CHECK-NEXT: ret entry: %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c @@ -956,7 +1110,7 @@ define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float ; CHECK-LABEL: fmla_indexed_scalar_2s: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s2 killed $s2 def $d2 -; CHECK-NEXT: fmla.2s v0, v1, v2 +; CHECK-NEXT: fmla v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret entry: %v1 = 
insertelement <2 x float> undef, float %c, i32 0 @@ -969,7 +1123,7 @@ define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float ; CHECK-LABEL: fmla_indexed_scalar_4s: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: fmla.4s v0, v1, v2[0] +; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0] ; CHECK-NEXT: ret entry: %v1 = insertelement <4 x float> undef, float %c, i32 0 @@ -984,7 +1138,7 @@ define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, do ; CHECK-LABEL: fmla_indexed_scalar_2d: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmla.2d v0, v1, v2[0] +; CHECK-NEXT: fmla v0.2d, v1.2d, v2.d[0] ; CHECK-NEXT: ret entry: %v1 = insertelement <2 x double> undef, double %c, i32 0 @@ -997,7 +1151,7 @@ define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x ; CHECK-LABEL: fmls_indexed_2s_strict: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fmls.2s v0, v2, v1[0] +; CHECK-NEXT: fmls v0.2s, v2.2s, v1.s[0] ; CHECK-NEXT: ret entry: %0 = fneg <2 x float> %c @@ -1009,7 +1163,7 @@ entry: define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp { ; CHECK-LABEL: fmls_indexed_4s_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmls.4s v0, v2, v1[0] +; CHECK-NEXT: fmls v0.4s, v2.4s, v1.s[0] ; CHECK-NEXT: ret entry: %0 = fneg <4 x float> %c @@ -1021,7 +1175,7 @@ entry: define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp { ; CHECK-LABEL: fmls_indexed_2d_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmls.2d v0, v2, v1[0] +; CHECK-NEXT: fmls v0.2d, v2.2d, v1.d[0] ; CHECK-NEXT: ret entry: %0 = fneg <2 x double> %c @@ -1034,7 +1188,7 @@ define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b ; CHECK-LABEL: 
fmla_indexed_scalar_2s_strict: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: fmla.2s v0, v1, v2[0] +; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0] ; CHECK-NEXT: ret entry: %v1 = insertelement <2 x float> undef, float %c, i32 0 @@ -1047,7 +1201,7 @@ define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b ; CHECK-LABEL: fmla_indexed_scalar_4s_strict: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: fmla.4s v0, v1, v2[0] +; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0] ; CHECK-NEXT: ret entry: %v1 = insertelement <4 x float> undef, float %c, i32 0 @@ -1062,7 +1216,7 @@ define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> ; CHECK-LABEL: fmla_indexed_scalar_2d_strict: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmla.2d v0, v1, v2[0] +; CHECK-NEXT: fmla v0.2d, v1.2d, v2.d[0] ; CHECK-NEXT: ret entry: %v1 = insertelement <2 x double> undef, double %c, i32 0 @@ -1081,7 +1235,7 @@ define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: mul_4h: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.4h v0, v0, v1[1] +; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = mul <4 x i16> %A, %tmp3 @@ -1091,7 +1245,7 @@ define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind { define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: mul_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: mul.8h v0, v0, v1[1] +; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %tmp4 = mul <8 x i16> %A, %tmp3 @@ -1102,7 +1256,7 @@ define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: mul_2s: ; 
CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.2s v0, v0, v1[1] +; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = mul <2 x i32> %A, %tmp3 @@ -1112,7 +1266,7 @@ define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind { define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: mul_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: mul.4s v0, v0, v1[1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = mul <4 x i32> %A, %tmp3 @@ -1120,17 +1274,29 @@ define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind { } define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind { -; CHECK-LABEL: mul_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov.d x8, v1[1] -; CHECK-NEXT: mov.d x9, v0[1] -; CHECK-NEXT: mul x10, x11, x10 -; CHECK-NEXT: mul x8, x9, x8 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: mov.d v0[1], x8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov x10, d1 +; CHECK-SD-NEXT: fmov x11, d0 +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: mul x10, x11, x10 +; CHECK-SD-NEXT: mul x8, x9, x8 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret %tmp1 = mul <2 x i64> %A, %B ret <2 x i64> %tmp1 } @@ -1139,7 +1305,7 @@ define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: fmul_lane_2s: ; 
CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fmul.2s v0, v0, v1[1] +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1> %tmp4 = fmul <2 x float> %A, %tmp3 @@ -1149,7 +1315,7 @@ define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: fmul_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul.4s v0, v0, v1[1] +; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = fmul <4 x float> %A, %tmp3 @@ -1159,7 +1325,7 @@ define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind { ; CHECK-LABEL: fmul_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul.2d v0, v0, v1[1] +; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1> %tmp4 = fmul <2 x double> %A, %tmp3 @@ -1169,7 +1335,7 @@ define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind { define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind { ; CHECK-LABEL: fmul_lane_s: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul.s s0, s0, v1[3] +; CHECK-NEXT: fmul s0, s0, v1.s[3] ; CHECK-NEXT: ret %B = extractelement <4 x float> %vec, i32 3 %res = fmul float %A, %B @@ -1179,7 +1345,7 @@ define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind { define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmul_lane_d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul.d d0, d0, v1[1] +; CHECK-NEXT: fmul d0, d0, v1.d[1] ; CHECK-NEXT: ret %B = extractelement <2 x double> %vec, i32 1 %res = fmul double %A, %B @@ -1192,7 +1358,7 @@ define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) 
nounwind { ; CHECK-LABEL: fmulx_lane_2s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fmulx.2s v0, v0, v1[1] +; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3) @@ -1202,7 +1368,7 @@ define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: fmulx_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: fmulx.4s v0, v0, v1[1] +; CHECK-NEXT: fmulx v0.4s, v0.4s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3) @@ -1212,7 +1378,7 @@ define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind { ; CHECK-LABEL: fmulx_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmulx.2d v0, v0, v1[1] +; CHECK-NEXT: fmulx v0.2d, v0.2d, v1.d[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3) @@ -1223,7 +1389,7 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4h: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3) @@ -1233,7 +1399,7 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { define <8 x i16> 
@sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3) @@ -1244,7 +1410,7 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_2s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3) @@ -1254,7 +1420,7 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3) @@ -1265,7 +1431,7 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_1s: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: sqdmulh.s s0, s1, v0[1] +; CHECK-NEXT: sqdmulh s0, s1, v0.s[1] ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %tmp1 = extractelement <4 x i32> %B, i32 1 @@ -1277,7 +1443,7 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4h: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1] ; 
CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3) @@ -1287,7 +1453,7 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3) @@ -1298,7 +1464,7 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_2s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3) @@ -1308,7 +1474,7 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3) @@ -1319,7 +1485,7 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_1s: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: sqrdmulh.s s0, s1, v0[1] +; CHECK-NEXT: sqrdmulh s0, s1, v0.s[1] ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret 
%tmp1 = extractelement <4 x i32> %B, i32 1 @@ -1331,7 +1497,7 @@ define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqdmull_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) @@ -1342,7 +1508,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqdmull_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) @@ -1350,10 +1516,16 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { } define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind { -; CHECK-LABEL: sqdmull2_lane_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull2.4s v0, v0, v1[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmull2_lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmull2_lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -1361,10 +1533,16 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind { } define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x 
i32> %B) nounwind { -; CHECK-LABEL: sqdmull2_lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull2.2d v0, v0, v1[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmull2_lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmull2_lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -1375,7 +1553,7 @@ define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: umull_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umull.4s v0, v0, v1[1] +; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) @@ -1386,7 +1564,7 @@ define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: umull_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umull.2d v0, v0, v1[1] +; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) @@ -1397,7 +1575,7 @@ define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: smull_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smull.4s v0, v0, v1[1] +; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp4 = call 
<4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) @@ -1408,7 +1586,7 @@ define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: smull_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smull.2d v0, v0, v1[1] +; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[1] ; CHECK-NEXT: ret %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) @@ -1419,8 +1597,8 @@ define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi ; CHECK-LABEL: smlal_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smlal.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: smlal v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1432,8 +1610,8 @@ define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi ; CHECK-LABEL: smlal_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smlal.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: smlal v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1445,8 +1623,8 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) noun ; CHECK-LABEL: sqdmlal_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmlal.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x 
i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1458,8 +1636,8 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) noun ; CHECK-LABEL: sqdmlal_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmlal.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: sqdmlal v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1468,11 +1646,18 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) noun } define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind { -; CHECK-LABEL: sqdmlal2_lane_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlal2 v2.4s, v0.8h, v1.h[1] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal2_lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: sqdmlal v0.4s, v3.4h, v1.h[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -1481,11 +1666,18 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou } define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind { -; CHECK-LABEL: sqdmlal2_lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 -; 
CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlal2 v2.2d, v0.4s, v1.s[1] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal2_lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: sqdmlal v0.2d, v3.2s, v1.s[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -1499,7 +1691,7 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqdmlal.h s2, h1, v0[1] +; CHECK-NEXT: sqdmlal s2, h1, v0.h[1] ; CHECK-NEXT: fmov w0, s2 ; CHECK-NEXT: ret %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 @@ -1517,7 +1709,7 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqdmlsl.h s2, h1, v0[1] +; CHECK-NEXT: sqdmlsl s2, h1, v0.h[1] ; CHECK-NEXT: fmov w0, s2 ; CHECK-NEXT: ret %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 @@ -1530,15 +1722,24 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind { -; CHECK-LABEL: sqadd_lane1_sqdmull4s: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull.4s v0, v0, v1 -; CHECK-NEXT: mov.s w8, v0[1] -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: sqadd s0, s0, s1 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqadd_lane1_sqdmull4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: mov w8, v0.s[1] +; 
CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w8 +; CHECK-SD-NEXT: sqadd s0, s0, s1 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqadd_lane1_sqdmull4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: sqadd s0, s1, s0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C) %prod = extractelement <4 x i32> %prod.vec, i32 1 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod) @@ -1546,15 +1747,24 @@ define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind { } define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind { -; CHECK-LABEL: sqsub_lane1_sqdmull4s: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull.4s v0, v0, v1 -; CHECK-NEXT: mov.s w8, v0[1] -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: sqsub s0, s0, s1 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqsub_lane1_sqdmull4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w8 +; CHECK-SD-NEXT: sqsub s0, s0, s1 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqsub_lane1_sqdmull4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: sqsub s0, s1, s0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C) %prod = extractelement <4 x i32> %prod.vec, i32 1 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod) @@ -1567,7 +1777,7 @@ define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ; CHECK-NEXT: fmov d1, x0 ; CHECK-NEXT: fmov s2, w1 ; CHECK-NEXT: // kill: def $d0 killed 
$d0 def $q0 -; CHECK-NEXT: sqdmlal.s d1, s2, v0[1] +; CHECK-NEXT: sqdmlal d1, s2, v0.s[1] ; CHECK-NEXT: fmov x0, d1 ; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 @@ -1584,7 +1794,7 @@ define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ; CHECK-NEXT: fmov d1, x0 ; CHECK-NEXT: fmov s2, w1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1] +; CHECK-NEXT: sqdmlsl d1, s2, v0.s[1] ; CHECK-NEXT: fmov x0, d1 ; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 @@ -1599,8 +1809,8 @@ define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi ; CHECK-LABEL: umlal_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umlal.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: umlal v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1612,8 +1822,8 @@ define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi ; CHECK-LABEL: umlal_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umlal.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: umlal v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1626,8 +1836,8 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi ; CHECK-LABEL: smlsl_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smlsl.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: smlsl v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x 
i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1639,8 +1849,8 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi ; CHECK-LABEL: smlsl_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: smlsl.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: smlsl v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1652,8 +1862,8 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) noun ; CHECK-LABEL: sqdmlsl_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmlsl.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: sqdmlsl v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1665,8 +1875,8 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) noun ; CHECK-LABEL: sqdmlsl_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqdmlsl.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: sqdmlsl v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1675,11 +1885,18 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) noun } define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind { -; CHECK-LABEL: sqdmlsl2_lane_4s: -; 
CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlsl2 v2.4s, v0.8h, v1.h[1] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: sqdmlsl v0.4s, v3.4h, v1.h[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -1688,11 +1905,18 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou } define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind { -; CHECK-LABEL: sqdmlsl2_lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlsl2 v2.2d, v0.4s, v1.s[1] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: sqdmlsl v0.2d, v3.2s, v1.s[1] +; CHECK-GI-NEXT: ret %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -1704,8 +1928,8 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi ; CHECK-LABEL: umlsl_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umlsl.4s v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: 
umlsl v2.4s, v0.4h, v1.h[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) @@ -1717,8 +1941,8 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi ; CHECK-LABEL: umlsl_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umlsl.2d v2, v0, v1[1] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: umlsl v2.2d, v0.2s, v1.s[1] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) @@ -1748,7 +1972,7 @@ define double @fmulxd(double %a, double %b) nounwind { define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { ; CHECK-LABEL: fmulxs_lane: ; CHECK: // %bb.0: -; CHECK-NEXT: fmulx.s s0, s0, v1[3] +; CHECK-NEXT: fmulx s0, s0, v1.s[3] ; CHECK-NEXT: ret %b = extractelement <4 x float> %vec, i32 3 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind @@ -1758,7 +1982,7 @@ define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmulxd_lane: ; CHECK: // %bb.0: -; CHECK-NEXT: fmulx.d d0, d0, v1[1] +; CHECK-NEXT: fmulx d0, d0, v1.d[1] ; CHECK-NEXT: ret %b = extractelement <2 x double> %vec, i32 1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind @@ -1772,7 +1996,7 @@ declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: smull2_8h_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: smull2.8h v0, v0, v1 +; CHECK-NEXT: smull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x 
i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -1783,7 +2007,7 @@ define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: foo0: ; CHECK: // %bb.0: -; CHECK-NEXT: smull2.8h v0, v0, v1 +; CHECK-NEXT: smull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1798,7 +2022,7 @@ define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: foo1: ; CHECK: // %bb.0: -; CHECK-NEXT: smull2.4s v0, v0, v1 +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1813,7 +2037,7 @@ define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: foo2: ; CHECK: // %bb.0: -; CHECK-NEXT: smull2.2d v0, v0, v1 +; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1828,7 +2052,7 @@ define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: foo3: ; CHECK: // %bb.0: -; CHECK-NEXT: umull2.8h v0, v0, v1 +; CHECK-NEXT: umull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1843,7 +2067,7 @@ define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: foo4: ; CHECK: // 
%bb.0: -; CHECK-NEXT: umull2.4s v0, v0, v1 +; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h ; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1858,7 +2082,7 @@ define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: foo5: ; CHECK: // %bb.0: -; CHECK-NEXT: umull2.2d v0, v0, v1 +; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s ; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -1871,11 +2095,18 @@ define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { } define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { -; CHECK-LABEL: foo6: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smull2.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: foo6: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: smull2 v0.4s, v1.8h, v2.h[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo6: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v2.h[1] +; CHECK-GI-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -1889,7 +2120,7 @@ define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readn ; CHECK-LABEL: foo6a: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smull.4s v0, v1, v2[1] +; CHECK-NEXT: smull v0.4s, v1.4h, v2.h[1] ; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> @@ -1901,11 +2132,18 @@ entry: } define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { -; CHECK-LABEL: foo7: -; 
CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smull2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: foo7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: smull2 v0.2d, v1.4s, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v2.s[1] +; CHECK-GI-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -1919,7 +2157,7 @@ define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readn ; CHECK-LABEL: foo7a: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smull.2d v0, v1, v2[1] +; CHECK-NEXT: smull v0.2d, v1.2s, v2.s[1] ; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> @@ -1932,11 +2170,18 @@ entry: define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { -; CHECK-LABEL: foo8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umull2.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: foo8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: umull2 v0.4s, v1.8h, v2.h[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v2.h[1] +; CHECK-GI-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -1950,7 +2195,7 @@ define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readn ; CHECK-LABEL: foo8a: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 
killed $d2 def $q2 -; CHECK-NEXT: umull.4s v0, v1, v2[1] +; CHECK-NEXT: umull v0.4s, v1.4h, v2.h[1] ; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> @@ -1962,11 +2207,18 @@ entry: } define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { -; CHECK-LABEL: foo9: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umull2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: foo9: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: umull2 v0.2d, v1.4s, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo9: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v2.s[1] +; CHECK-GI-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -1980,7 +2232,7 @@ define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readn ; CHECK-LABEL: foo9a: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umull.2d v0, v1, v2[1] +; CHECK-NEXT: umull v0.2d, v1.2s, v2.s[1] ; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> @@ -1994,7 +2246,7 @@ entry: define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { ; CHECK-LABEL: bar0: ; CHECK: // %bb.0: -; CHECK-NEXT: smlal2.8h v0, v1, v2 +; CHECK-NEXT: smlal2 v0.8h, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2010,7 +2262,7 @@ define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { ; CHECK-LABEL: bar1: ; CHECK: // %bb.0: -; CHECK-NEXT: smlal2.4s v0, v1, v2 +; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret 
%tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2026,7 +2278,7 @@ define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { ; CHECK-LABEL: bar2: ; CHECK: // %bb.0: -; CHECK-NEXT: smlal2.2d v0, v1, v2 +; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2042,7 +2294,7 @@ define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { ; CHECK-LABEL: bar3: ; CHECK: // %bb.0: -; CHECK-NEXT: umlal2.8h v0, v1, v2 +; CHECK-NEXT: umlal2 v0.8h, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2058,7 +2310,7 @@ define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { ; CHECK-LABEL: bar4: ; CHECK: // %bb.0: -; CHECK-NEXT: umlal2.4s v0, v1, v2 +; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2074,7 +2326,7 @@ define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { ; CHECK-LABEL: bar5: ; CHECK: // %bb.0: -; CHECK-NEXT: umlal2.2d v0, v1, v2 +; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2088,11 +2340,18 @@ define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { } define <4 x i32> @mlal2_1(<4 x i32> %a, 
<8 x i16> %b, <4 x i16> %c) nounwind { -; CHECK-LABEL: mlal2_1: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smlal2.4s v0, v1, v2[3] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mlal2_1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: smlal2 v0.4s, v1.8h, v2.h[3] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mlal2_1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: dup v2.8h, v2.h[3] +; CHECK-GI-NEXT: smlal2 v0.4s, v1.8h, v2.8h +; CHECK-GI-NEXT: ret %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2106,11 +2365,18 @@ define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { } define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { -; CHECK-LABEL: mlal2_2: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: smlal2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mlal2_2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: smlal2 v0.2d, v1.4s, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mlal2_2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: dup v2.4s, v2.s[1] +; CHECK-GI-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-GI-NEXT: ret %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2124,11 +2390,18 @@ define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { } define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { -; CHECK-LABEL: mlal2_4: -; CHECK: // %bb.0: -; 
CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umlal2.4s v0, v1, v2[2] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mlal2_4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: umlal2 v0.4s, v1.8h, v2.h[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mlal2_4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: dup v2.8h, v2.h[2] +; CHECK-GI-NEXT: umlal2 v0.4s, v1.8h, v2.8h +; CHECK-GI-NEXT: ret %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2142,11 +2415,18 @@ define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { } define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { -; CHECK-LABEL: mlal2_5: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umlal2.2d v0, v1, v2[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mlal2_5: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: umlal2 v0.2d, v1.4s, v2.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mlal2_5: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: dup v2.4s, v2.s[0] +; CHECK-GI-NEXT: umlal2 v0.2d, v1.4s, v2.4s +; CHECK-GI-NEXT: ret %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> @@ -2164,7 +2444,7 @@ define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone s ; CHECK-LABEL: vmulq_n_f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: fmul.2d v0, v0, v1[0] +; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[0] ; CHECK-NEXT: ret entry: %vecinit.i = 
insertelement <2 x double> undef, double %y, i32 0 @@ -2177,7 +2457,7 @@ define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp ; CHECK-LABEL: vmulq_n_f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: fmul.4s v0, v0, v1[0] +; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[0] ; CHECK-NEXT: ret entry: %vecinit.i = insertelement <4 x float> undef, float %y, i32 0 @@ -2192,7 +2472,7 @@ define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp { ; CHECK-LABEL: vmul_n_f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: fmul.2s v0, v0, v1[0] +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0] ; CHECK-NEXT: ret entry: %vecinit.i = insertelement <2 x float> undef, float %y, i32 0 @@ -2204,7 +2484,7 @@ entry: define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp { ; CHECK-LABEL: vmla_laneq_s16_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mla.4h v0, v1, v2[6] +; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[6] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> @@ -2216,7 +2496,7 @@ entry: define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp { ; CHECK-LABEL: vmla_laneq_s32_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mla.2s v0, v1, v2[3] +; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -2226,10 +2506,16 @@ entry: } define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp { -; CHECK-LABEL: not_really_vmlaq_laneq_s16_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mla.8h v0, v1, v2[5] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: not_really_vmlaq_laneq_s16_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mla v0.8h, v1.8h, v2.h[5] +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: not_really_vmlaq_laneq_s16_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v2.16b, v2.16b, v0.16b, #8 +; CHECK-GI-NEXT: mla v0.8h, v1.8h, v2.h[1] +; CHECK-GI-NEXT: ret entry: %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> @@ -2239,10 +2525,16 @@ entry: } define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp { -; CHECK-LABEL: not_really_vmlaq_laneq_s32_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mla.4s v0, v1, v2[3] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: not_really_vmlaq_laneq_s32_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mla v0.4s, v1.4s, v2.s[3] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: not_really_vmlaq_laneq_s32_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v2.16b, v2.16b, v0.16b, #8 +; CHECK-GI-NEXT: mla v0.4s, v1.4s, v2.s[1] +; CHECK-GI-NEXT: ret entry: %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -2254,7 +2546,7 @@ entry: define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { ; CHECK-LABEL: vmull_laneq_s16_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: smull.4s v0, v0, v1[6] +; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[6] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> @@ -2265,7 +2557,7 @@ entry: define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { ; CHECK-LABEL: vmull_laneq_s32_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: smull.2d v0, v0, v1[2] +; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[2] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> 
%b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> @@ -2275,7 +2567,7 @@ entry: define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { ; CHECK-LABEL: vmull_laneq_u16_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umull.4s v0, v0, v1[6] +; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[6] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> @@ -2286,7 +2578,7 @@ entry: define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { ; CHECK-LABEL: vmull_laneq_u32_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umull.2d v0, v0, v1[2] +; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[2] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> @@ -2297,8 +2589,8 @@ entry: define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { ; CHECK-LABEL: vmull_low_n_s16_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup.4h v0, w0 -; CHECK-NEXT: smull.4s v0, v1, v0 +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h ; CHECK-NEXT: ret entry: %conv = trunc i32 %d to i16 @@ -2314,11 +2606,18 @@ entry: } define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { -; CHECK-LABEL: vmull_high_n_s16_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup.8h v0, w0 -; CHECK-NEXT: smull2.4s v0, v1, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmull_high_n_s16_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.8h, w0 +; CHECK-SD-NEXT: smull2 v0.4s, v1.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmull_high_n_s16_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: dup v1.4h, w0 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %conv = trunc i32 %d to i16 %0 = bitcast <8 x i16> %b to <2 x i64> @@ -2333,11 +2632,18 @@ 
entry: } define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { -; CHECK-LABEL: vmull_high_n_s32_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup.4s v0, w0 -; CHECK-NEXT: smull2.2d v0, v1, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmull_high_n_s32_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.4s, w0 +; CHECK-SD-NEXT: smull2 v0.2d, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmull_high_n_s32_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -2349,11 +2655,18 @@ entry: } define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { -; CHECK-LABEL: vmull_high_n_u16_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup.8h v0, w0 -; CHECK-NEXT: umull2.4s v0, v1, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmull_high_n_u16_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.8h, w0 +; CHECK-SD-NEXT: umull2 v0.4s, v1.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmull_high_n_u16_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: dup v1.4h, w0 +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %conv = trunc i32 %d to i16 %0 = bitcast <8 x i16> %b to <2 x i64> @@ -2368,11 +2681,18 @@ entry: } define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { -; CHECK-LABEL: vmull_high_n_u32_test: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup.4s v0, w0 -; CHECK-NEXT: umull2.2d v0, v1, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmull_high_n_u32_test: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.4s, w0 +; CHECK-SD-NEXT: umull2 
v0.2d, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmull_high_n_u32_test: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v1.d[1] +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> @@ -2384,10 +2704,17 @@ entry: } define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: vmul_built_dup_test: -; CHECK: // %bb.0: -; CHECK-NEXT: mul.4s v0, v0, v1[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmul_built_dup_test: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmul_built_dup_test: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov s1, v1.s[1] +; CHECK-GI-NEXT: dup v1.4s, v1.s[0] +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret %vget_lane = extractelement <4 x i32> %b, i32 1 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 @@ -2398,11 +2725,19 @@ define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) { } define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: vmul_built_dup_fromsmall_test: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.4h v0, v0, v1[3] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmul_built_dup_fromsmall_test: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[3] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmul_built_dup_fromsmall_test: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov h1, v1.h[3] +; CHECK-GI-NEXT: dup v1.4h, v1.h[0] +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 3 %vecinit.i = insertelement <4 x i16> undef, 
i16 %vget_lane, i32 0 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 @@ -2413,11 +2748,18 @@ define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { } define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: vmulq_built_dup_fromsmall_test: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.8h v0, v0, v1[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vmulq_built_dup_fromsmall_test: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mul v0.8h, v0.8h, v1.h[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vmulq_built_dup_fromsmall_test: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: dup v1.8h, v1.h[0] +; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 0 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 @@ -2434,7 +2776,7 @@ define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: mull_from_two_extracts: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull2.2d v0, v0, v1 +; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s ; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> @@ -2446,7 +2788,7 @@ define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: mlal_from_two_extracts: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.2d v0, v1, v2 +; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s ; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> @@ -2459,8 +2801,8 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: mull_from_extract_dup_low: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.2s v1, w0 -; CHECK-NEXT: sqdmull.2d v0, v0, v1 +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -2472,11 +2814,18 @@ define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) { } define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: mull_from_extract_dup_high: -; CHECK: // %bb.0: -; CHECK-NEXT: dup.4s v1, w0 -; CHECK-NEXT: sqdmull2.2d v0, v0, v1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mull_from_extract_dup_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.4s, w0 +; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mull_from_extract_dup_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -2489,8 +2838,8 @@ define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { ; CHECK-LABEL: pmull_from_extract_dup_low: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.8b v1, w0 -; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2504,8 
+2853,8 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { ; CHECK-LABEL: pmull_from_extract_dup_high: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.16b v1, w0 -; CHECK-NEXT: pmull2.8h v0, v0, v1 +; CHECK-NEXT: dup v1.16b, w0 +; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2520,8 +2869,8 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) ; CHECK-LABEL: pmull_from_extract_duplane_low: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup.8b v1, v1[0] -; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: dup v1.8b, v1.b[0] +; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2534,8 +2883,8 @@ define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) ; CHECK-LABEL: pmull_from_extract_duplane_high: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup.16b v1, v1[0] -; CHECK-NEXT: pmull2.8h v0, v0, v1 +; CHECK-NEXT: dup v1.16b, v1.b[0] +; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b ; CHECK-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2547,7 +2896,7 @@ define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> 
%lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmull_from_extract_duplane_low: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull.2d v0, v0, v1[0] +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] ; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2557,10 +2906,16 @@ define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rh } define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: sqdmull_from_extract_duplane_high: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmull2.2d v0, v0, v1[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmull_from_extract_duplane_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmull_from_extract_duplane_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] +; CHECK-GI-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2571,7 +2926,7 @@ define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %r define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmlal_from_extract_duplane_low: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal.2d v0, v1, v2[0] +; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0] ; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2582,10 +2937,16 @@ define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> % } define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: sqdmlal_from_extract_duplane_high: -; 
CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal_from_extract_duplane_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal_from_extract_duplane_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0] +; CHECK-GI-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2597,7 +2958,7 @@ define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: umlal_from_extract_duplane_low: ; CHECK: // %bb.0: -; CHECK-NEXT: umlal.2d v0, v1, v2[0] +; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0] ; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2608,10 +2969,16 @@ define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lh } define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: umlal_from_extract_duplane_high: -; CHECK: // %bb.0: -; CHECK-NEXT: umlal2.2d v0, v1, v2[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umlal_from_extract_duplane_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umlal2 v0.2d, v1.4s, v2.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umlal_from_extract_duplane_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0] +; CHECK-GI-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> @@ -2623,7 +2990,7 @@ define <2 x i64> 
@umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %l define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { ; CHECK-LABEL: scalar_fmla_from_extract_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmla.s s0, s1, v2[3] +; CHECK-NEXT: fmla s0, s1, v2.s[3] ; CHECK-NEXT: ret %rhs = extractelement <4 x float> %rvec, i32 3 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) @@ -2631,11 +2998,18 @@ define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x floa } define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { -; CHECK-LABEL: scalar_fmla_from_extract_v2f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmla.s s0, s1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_fmla_from_extract_v2f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: fmla s0, s1, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_fmla_from_extract_v2f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov s2, v2.s[1] +; CHECK-GI-NEXT: fmadd s0, s1, s2, s0 +; CHECK-GI-NEXT: ret %rhs = extractelement <2 x float> %rvec, i32 1 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) ret float %res @@ -2644,7 +3018,7 @@ define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x floa define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { ; CHECK-LABEL: scalar_fmls_from_extract_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmls.s s0, s1, v2[3] +; CHECK-NEXT: fmls s0, s1, v2.s[3] ; CHECK-NEXT: ret %rhs.scal = extractelement <4 x float> %rvec, i32 3 %rhs = fsub float -0.0, %rhs.scal @@ -2656,7 +3030,7 @@ define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x floa ; CHECK-LABEL: scalar_fmls_from_extract_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 
def $q2 -; CHECK-NEXT: fmls.s s0, s1, v2[1] +; CHECK-NEXT: fmls s0, s1, v2.s[1] ; CHECK-NEXT: ret %rhs.scal = extractelement <2 x float> %rvec, i32 1 %rhs = fsub float -0.0, %rhs.scal @@ -2669,7 +3043,7 @@ declare float @llvm.fma.f32(float, float, float) define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { ; CHECK-LABEL: scalar_fmla_from_extract_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmla.d d0, d1, v2[1] +; CHECK-NEXT: fmla d0, d1, v2.d[1] ; CHECK-NEXT: ret %rhs = extractelement <2 x double> %rvec, i32 1 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) @@ -2679,7 +3053,7 @@ define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x d define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { ; CHECK-LABEL: scalar_fmls_from_extract_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmls.d d0, d1, v2[1] +; CHECK-NEXT: fmls d0, d1, v2.d[1] ; CHECK-NEXT: ret %rhs.scal = extractelement <2 x double> %rvec, i32 1 %rhs = fsub double -0.0, %rhs.scal @@ -2692,7 +3066,7 @@ declare double @llvm.fma.f64(double, double, double) define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmls.2s v0, v1, v2[3] +; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[3] ; CHECK-NEXT: ret %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3> @@ -2704,7 +3078,7 @@ define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmls.2s v0, v1, v2[1] +; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[1] ; CHECK-NEXT: ret %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs %splat = 
shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1> @@ -2715,7 +3089,7 @@ define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmls.4s v0, v1, v2[3] +; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[3] ; CHECK-NEXT: ret %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -2727,7 +3101,7 @@ define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmls.4s v0, v1, v2[1] +; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[1] ; CHECK-NEXT: ret %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -2738,7 +3112,7 @@ define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmls.2d v0, v1, v2[1] +; CHECK-NEXT: fmls v0.2d, v1.2d, v2.d[1] ; CHECK-NEXT: ret %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1> @@ -2770,7 +3144,7 @@ define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind { ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: fmov s2, w2 -; CHECK-NEXT: sqdmlal.h s2, h0, v1[0] +; CHECK-NEXT: sqdmlal s2, h0, v1.h[0] ; CHECK-NEXT: fmov w0, s2 ; CHECK-NEXT: ret %tmp1 = insertelement <4 x i16> undef, i16 
%A, i64 0 @@ -2801,7 +3175,7 @@ define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind { ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: fmov s2, w2 -; CHECK-NEXT: sqdmlsl.h s2, h0, v1[0] +; CHECK-NEXT: sqdmlsl s2, h0, v1.h[0] ; CHECK-NEXT: fmov w0, s2 ; CHECK-NEXT: ret %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0 @@ -2831,7 +3205,7 @@ define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x1 ; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: pmull.1q v0, v1, v0 +; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d ; CHECK-NEXT: ret %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) ret <16 x i8> %val @@ -2840,7 +3214,7 @@ define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { ; CHECK-LABEL: test_pmull_high_64: ; CHECK: // %bb.0: -; CHECK-NEXT: pmull2.1q v0, v0, v1 +; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d ; CHECK-NEXT: ret %l_hi = extractelement <2 x i64> %l, i32 1 %r_hi = extractelement <2 x i64> %r, i32 1 @@ -2851,15 +3225,23 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { -; CHECK-LABEL: test_mul_v1i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: mul x8, x9, x8 -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_mul_v1i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: mul x8, x9, x8 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mul_v1i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: 
fmov x9, d1 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret %prod = mul <1 x i64> %lhs, %rhs ret <1 x i64> %prod } @@ -2867,7 +3249,7 @@ define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { define <4 x i32> @sqdmlal4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) { ; CHECK-LABEL: sqdmlal4s_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal.4s v0, v1, v2 +; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2) %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp) @@ -2877,7 +3259,7 @@ define <4 x i32> @sqdmlal4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) { define <2 x i64> @sqdmlal2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) { ; CHECK-LABEL: sqdmlal2d_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal.2d v0, v1, v2 +; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2) %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp) @@ -2887,7 +3269,7 @@ define <2 x i64> @sqdmlal2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) { define <4 x i32> @sqdmlal2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) { ; CHECK-LABEL: sqdmlal2_4s_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.4s v0, v1, v2 +; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -2899,7 +3281,7 @@ define <4 x i32> @sqdmlal2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) define <2 x i64> @sqdmlal2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) { ; CHECK-LABEL: sqdmlal2_2d_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.2d v0, v1, v2 +; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp0 = 
shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3> %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3> @@ -2912,7 +3294,7 @@ define <4 x i32> @sqdmlal_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> % ; CHECK-LABEL: sqdmlal_lane_4s_lib: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqdmlal.4s v0, v1, v2[3] +; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.h[3] ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0) @@ -2924,7 +3306,7 @@ define <2 x i64> @sqdmlal_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> % ; CHECK-LABEL: sqdmlal_lane_2d_lib: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1] ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0) @@ -2933,10 +3315,16 @@ define <2 x i64> @sqdmlal_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> % } define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) { -; CHECK-LABEL: sqdmlal2_lane_4s_lib: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.4s v0, v1, v2[7] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_lane_4s_lib: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlal2 v0.4s, v1.8h, v2.h[7] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal2_lane_4s_lib: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.h[7] +; CHECK-GI-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> 
%tmp0, <4 x i16> %tmp1) @@ -2945,10 +3333,16 @@ define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> } define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) { -; CHECK-LABEL: sqdmlal2_lane_2d_lib: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlal2_lane_2d_lib: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlal2_lane_2d_lib: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1] +; CHECK-GI-NEXT: ret %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3> %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1) @@ -2959,7 +3353,7 @@ define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> define <4 x i32> @sqdmlsl4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) { ; CHECK-LABEL: sqdmlsl4s_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 +; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2) %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp) @@ -2969,7 +3363,7 @@ define <4 x i32> @sqdmlsl4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) { define <2 x i64> @sqdmlsl2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) { ; CHECK-LABEL: sqdmlsl2d_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 +; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2) %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp) @@ -2979,7 +3373,7 @@ define <2 x i64> @sqdmlsl2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) { define <4 x i32> 
@sqdmlsl2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) { ; CHECK-LABEL: sqdmlsl2_4s_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.4s v0, v1, v2 +; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -2991,7 +3385,7 @@ define <4 x i32> @sqdmlsl2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) define <2 x i64> @sqdmlsl2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) { ; CHECK-LABEL: sqdmlsl2_2d_lib: ; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.2d v0, v1, v2 +; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3> %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3> @@ -3004,7 +3398,7 @@ define <4 x i32> @sqdmlsl_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> % ; CHECK-LABEL: sqdmlsl_lane_4s_lib: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[3] +; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[3] ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0) @@ -3016,7 +3410,7 @@ define <2 x i64> @sqdmlsl_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> % ; CHECK-LABEL: sqdmlsl_lane_2d_lib: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1] ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0) @@ -3025,10 +3419,16 @@ define <2 x i64> @sqdmlsl_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> % } define <4 x i32> 
@sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) { -; CHECK-LABEL: sqdmlsl2_lane_4s_lib: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.4s v0, v1, v2[7] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_lane_4s_lib: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.h[7] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_lane_4s_lib: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[7] +; CHECK-GI-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1) @@ -3037,10 +3437,16 @@ define <4 x i32> @sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> } define <2 x i64> @sqdmlsl2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) { -; CHECK-LABEL: sqdmlsl2_lane_2d_lib: -; CHECK: // %bb.0: -; CHECK-NEXT: sqdmlsl2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqdmlsl2_lane_2d_lib: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqdmlsl2_lane_2d_lib: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1] +; CHECK-GI-NEXT: ret %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3> %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1> %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1) |