-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll    248
-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll        245
-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll 273
-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll     273
-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll                  104
5 files changed, 1143 insertions, 0 deletions
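
All five tests below exercise the AArch64 complex-deinterleaving lowering on the same family of expressions. In each function a <4 x double> (or <vscale x 4 x double>) carries two interleaved complex doubles, and the IR spells out the textbook product on the deinterleaved halves. As a reading aid for the check lines, the identity being matched is:

  (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)

and multiplying by 1i is the lane rotation (re, im) -> (-im, re), which the rot_mull tests encode with llvm.copysign against zero plus fadd/fsub.
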
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
new file mode 100644
index 0000000..1c5f713
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v5.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: fmul v1.2d, v5.2d, v4.2d
+; CHECK-NEXT: fmul v3.2d, v0.2d, v4.2d
+; CHECK-NEXT: fneg v1.2d, v1.2d
+; CHECK-NEXT: fmla v3.2d, v2.2d, v5.2d
+; CHECK-NEXT: fmla v1.2d, v2.2d, v0.2d
+; CHECK-NEXT: fadd v3.2d, v3.2d, v4.2d
+; CHECK-NEXT: fadd v1.2d, v2.2d, v1.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v3.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul contract <2 x double> %strided.vec, %strided.vec31
+ %1 = fmul contract <2 x double> %strided.vec28, %strided.vec30
+ %2 = fadd contract <2 x double> %1, %0
+ %3 = fmul contract <2 x double> %strided.vec, %strided.vec30
+ %4 = fmul contract <2 x double> %strided.vec28, %strided.vec31
+ %5 = fsub contract <2 x double> %3, %4
+ %strided.vec33 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec34 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fadd contract <2 x double> %strided.vec33, %5
+ %7 = fadd contract <2 x double> %2, %strided.vec34
+ %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: movi v18.2d, #0000000000000000
+; CHECK-NEXT: movi v19.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT: fadd v0.2d, v16.2d, v18.2d
+; CHECK-NEXT: fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul contract <2 x double> %strided.vec, %strided.vec54
+ %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53
+ %2 = fadd contract <2 x double> %1, %0
+ %3 = fmul contract <2 x double> %strided.vec, %strided.vec53
+ %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54
+ %5 = fsub contract <2 x double> %3, %4
+ %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60
+ %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59
+ %8 = fadd contract <2 x double> %7, %6
+ %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59
+ %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60
+ %11 = fsub contract <2 x double> %9, %10
+ %12 = fadd contract <2 x double> %5, %11
+ %13 = fadd contract <2 x double> %2, %8
+ %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: movi v18.2d, #0000000000000000
+; CHECK-NEXT: movi v19.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT: fsub v0.2d, v16.2d, v18.2d
+; CHECK-NEXT: fsub v1.2d, v17.2d, v19.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul contract <2 x double> %strided.vec, %strided.vec54
+ %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53
+ %2 = fadd contract <2 x double> %1, %0
+ %3 = fmul contract <2 x double> %strided.vec, %strided.vec53
+ %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54
+ %5 = fsub contract <2 x double> %3, %4
+ %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60
+ %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59
+ %8 = fadd contract <2 x double> %7, %6
+ %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59
+ %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60
+ %11 = fsub contract <2 x double> %9, %10
+ %12 = fsub contract <2 x double> %5, %11
+ %13 = fsub contract <2 x double> %2, %8
+ %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + conj(c) * d
+define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: movi v18.2d, #0000000000000000
+; CHECK-NEXT: movi v19.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT: fcmla v18.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT: fcmla v19.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT: fcmla v18.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT: fcmla v19.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT: fadd v0.2d, v16.2d, v18.2d
+; CHECK-NEXT: fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul contract <2 x double> %strided.vec, %strided.vec62
+ %1 = fmul contract <2 x double> %strided.vec59, %strided.vec61
+ %2 = fadd contract <2 x double> %1, %0
+ %3 = fmul contract <2 x double> %strided.vec, %strided.vec61
+ %4 = fmul contract <2 x double> %strided.vec59, %strided.vec62
+ %5 = fsub contract <2 x double> %3, %4
+ %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fmul contract <2 x double> %strided.vec64, %strided.vec68
+ %7 = fmul contract <2 x double> %strided.vec65, %strided.vec67
+ %8 = fsub contract <2 x double> %6, %7
+ %9 = fmul contract <2 x double> %strided.vec64, %strided.vec67
+ %10 = fmul contract <2 x double> %strided.vec65, %strided.vec68
+ %11 = fadd contract <2 x double> %9, %10
+ %12 = fadd contract <2 x double> %5, %11
+ %13 = fadd contract <2 x double> %2, %8
+ %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v16.2d, #0xffffffffffffffff
+; CHECK-NEXT: zip2 v17.2d, v4.2d, v5.2d
+; CHECK-NEXT: movi v18.2d, #0000000000000000
+; CHECK-NEXT: zip1 v19.2d, v0.2d, v1.2d
+; CHECK-NEXT: fneg v16.2d, v16.2d
+; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v5.2d, v2.2d, v3.2d
+; CHECK-NEXT: mov v4.16b, v16.16b
+; CHECK-NEXT: bsl v4.16b, v18.16b, v17.16b
+; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: mov v3.16b, v16.16b
+; CHECK-NEXT: bsl v3.16b, v18.16b, v1.16b
+; CHECK-NEXT: fadd v1.2d, v1.2d, v4.2d
+; CHECK-NEXT: zip2 v4.2d, v6.2d, v7.2d
+; CHECK-NEXT: zip1 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmul v7.2d, v0.2d, v2.2d
+; CHECK-NEXT: fsub v3.2d, v3.2d, v17.2d
+; CHECK-NEXT: fmul v16.2d, v1.2d, v4.2d
+; CHECK-NEXT: fmul v2.2d, v19.2d, v2.2d
+; CHECK-NEXT: fneg v7.2d, v7.2d
+; CHECK-NEXT: fmul v4.2d, v3.2d, v4.2d
+; CHECK-NEXT: fneg v16.2d, v16.2d
+; CHECK-NEXT: fmla v2.2d, v5.2d, v0.2d
+; CHECK-NEXT: fmla v7.2d, v5.2d, v19.2d
+; CHECK-NEXT: fmla v4.2d, v1.2d, v6.2d
+; CHECK-NEXT: fmla v16.2d, v6.2d, v3.2d
+; CHECK-NEXT: fadd v1.2d, v2.2d, v4.2d
+; CHECK-NEXT: fadd v2.2d, v7.2d, v16.2d
+; CHECK-NEXT: zip1 v0.2d, v2.2d, v1.2d
+; CHECK-NEXT: zip2 v1.2d, v2.2d, v1.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec77 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec79 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec80 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul contract <2 x double> %strided.vec, %strided.vec80
+ %1 = fmul contract <2 x double> %strided.vec77, %strided.vec79
+ %2 = fadd contract <2 x double> %1, %0
+ %3 = fmul contract <2 x double> %strided.vec, %strided.vec79
+ %4 = fmul contract <2 x double> %strided.vec77, %strided.vec80
+ %5 = fsub contract <2 x double> %3, %4
+ %strided.vec82 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec83 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %6 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec83)
+ %7 = fadd contract <2 x double> %strided.vec82, %6
+ %8 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec82)
+ %9 = fsub contract <2 x double> %8, %strided.vec83
+ %strided.vec85 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec86 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %10 = fmul contract <2 x double> %9, %strided.vec86
+ %11 = fmul contract <2 x double> %strided.vec85, %7
+ %12 = fadd contract <2 x double> %11, %10
+ %13 = fmul contract <2 x double> %9, %strided.vec85
+ %14 = fmul contract <2 x double> %7, %strided.vec86
+ %15 = fsub contract <2 x double> %13, %14
+ %16 = fadd contract <2 x double> %5, %15
+ %17 = fadd contract <2 x double> %2, %12
+ %interleaved.vec = shufflevector <2 x double> %16, <2 x double> %17, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
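
A note on the fcmla pairs above: one fcmla with rotation #0 plus one with #90 accumulates a full complex product into the destination. Roughly, with operand roles illustrative (see the Arm FCMLA definition for the exact lane assignment):

  fcmla acc, n, m, #0   =>  acc.re += n.re*m.re ;  acc.im += n.re*m.im
  fcmla acc, n, m, #90  =>  acc.re -= n.im*m.im ;  acc.im += n.im*m.re

Summing the two gives acc += n*m. Replacing #90 with #270 flips both of its signs, which amounts to conjugating one operand; that is why mul_conj_mull pairs #0 with #270 and swaps the source register order. mull_add and mul_add_rot_mull, by contrast, still lower to separate fmul/fmla/fneg and bit-select sequences: with only contract set, these check lines record that the trailing add and the 1i rotation are not folded into an fcmla chain.
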
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
new file mode 100644
index 0000000..577c3ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip2 v6.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v7.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: fmla v6.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmla v1.2d, v7.2d, v4.2d
+; CHECK-NEXT: fmla v6.2d, v7.2d, v2.2d
+; CHECK-NEXT: fmls v1.2d, v0.2d, v2.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v6.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v6.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
+ %2 = fadd fast <2 x double> %0, %1
+ %3 = fmul fast <2 x double> %strided.vec30, %strided.vec
+ %strided.vec33 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec34 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %4 = fadd fast <2 x double> %strided.vec33, %3
+ %5 = fmul fast <2 x double> %strided.vec31, %strided.vec28
+ %6 = fsub fast <2 x double> %4, %5
+ %7 = fadd fast <2 x double> %2, %strided.vec34
+ %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip1 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip1 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip1 v2.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d
+; CHECK-NEXT: fmul v4.2d, v16.2d, v0.2d
+; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: fmul v7.2d, v16.2d, v17.2d
+; CHECK-NEXT: fmla v4.2d, v17.2d, v1.2d
+; CHECK-NEXT: fmla v0.2d, v3.2d, v6.2d
+; CHECK-NEXT: fmla v7.2d, v2.2d, v5.2d
+; CHECK-NEXT: fmla v4.2d, v3.2d, v5.2d
+; CHECK-NEXT: fsub v1.2d, v7.2d, v0.2d
+; CHECK-NEXT: fmla v4.2d, v2.2d, v6.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec54, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec53, %strided.vec51
+ %2 = fmul fast <2 x double> %strided.vec53, %strided.vec
+ %3 = fmul fast <2 x double> %strided.vec54, %strided.vec51
+ %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %4 = fmul fast <2 x double> %strided.vec60, %strided.vec56
+ %5 = fmul fast <2 x double> %strided.vec59, %strided.vec57
+ %6 = fmul fast <2 x double> %strided.vec59, %strided.vec56
+ %7 = fmul fast <2 x double> %strided.vec60, %strided.vec57
+ %8 = fadd fast <2 x double> %7, %3
+ %9 = fadd fast <2 x double> %6, %2
+ %10 = fsub fast <2 x double> %9, %8
+ %11 = fadd fast <2 x double> %0, %1
+ %12 = fadd fast <2 x double> %11, %5
+ %13 = fadd fast <2 x double> %12, %4
+ %interleaved.vec = shufflevector <2 x double> %10, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d
+; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d
+; CHECK-NEXT: fmul v4.2d, v17.2d, v0.2d
+; CHECK-NEXT: fmul v5.2d, v17.2d, v18.2d
+; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmul v7.2d, v3.2d, v2.2d
+; CHECK-NEXT: fmla v4.2d, v18.2d, v1.2d
+; CHECK-NEXT: fmla v0.2d, v16.2d, v3.2d
+; CHECK-NEXT: fmla v5.2d, v2.2d, v6.2d
+; CHECK-NEXT: fmla v7.2d, v16.2d, v6.2d
+; CHECK-NEXT: fsub v1.2d, v5.2d, v0.2d
+; CHECK-NEXT: fsub v2.2d, v4.2d, v7.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec53 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec55 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec56 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec56, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec55, %strided.vec53
+ %2 = fmul fast <2 x double> %strided.vec55, %strided.vec
+ %3 = fmul fast <2 x double> %strided.vec56, %strided.vec53
+ %strided.vec58 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec59 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec61 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec62 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %4 = fmul fast <2 x double> %strided.vec62, %strided.vec59
+ %5 = fmul fast <2 x double> %strided.vec61, %strided.vec58
+ %6 = fadd fast <2 x double> %5, %3
+ %7 = fadd fast <2 x double> %4, %2
+ %8 = fsub fast <2 x double> %7, %6
+ %9 = fmul fast <2 x double> %strided.vec61, %strided.vec59
+ %10 = fmul fast <2 x double> %strided.vec62, %strided.vec58
+ %11 = fadd fast <2 x double> %10, %9
+ %12 = fadd fast <2 x double> %0, %1
+ %13 = fsub fast <2 x double> %12, %11
+ %interleaved.vec = shufflevector <2 x double> %8, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + conj(c) * d
+define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmul v3.2d, v16.2d, v17.2d
+; CHECK-NEXT: fmul v1.2d, v2.2d, v17.2d
+; CHECK-NEXT: zip1 v17.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip2 v4.2d, v4.2d, v5.2d
+; CHECK-NEXT: fneg v3.2d, v3.2d
+; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmla v1.2d, v0.2d, v16.2d
+; CHECK-NEXT: fmla v3.2d, v0.2d, v2.2d
+; CHECK-NEXT: zip2 v0.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmls v1.2d, v4.2d, v5.2d
+; CHECK-NEXT: fmla v3.2d, v17.2d, v5.2d
+; CHECK-NEXT: fmla v1.2d, v17.2d, v0.2d
+; CHECK-NEXT: fmla v3.2d, v4.2d, v0.2d
+; CHECK-NEXT: zip1 v0.2d, v3.2d, v1.2d
+; CHECK-NEXT: zip2 v1.2d, v3.2d, v1.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec62, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec61, %strided.vec59
+ %2 = fmul fast <2 x double> %strided.vec61, %strided.vec
+ %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %3 = fmul fast <2 x double> %strided.vec68, %strided.vec64
+ %4 = fmul fast <2 x double> %strided.vec67, %strided.vec64
+ %5 = fmul fast <2 x double> %strided.vec68, %strided.vec65
+ %6 = fmul fast <2 x double> %strided.vec62, %strided.vec59
+ %7 = fsub fast <2 x double> %2, %6
+ %8 = fadd fast <2 x double> %7, %4
+ %9 = fadd fast <2 x double> %8, %5
+ %10 = fadd fast <2 x double> %0, %1
+ %11 = fmul fast <2 x double> %strided.vec67, %strided.vec65
+ %12 = fsub fast <2 x double> %10, %11
+ %13 = fadd fast <2 x double> %12, %3
+ %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v18.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmul v19.2d, v16.2d, v17.2d
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT: fmul v4.2d, v2.2d, v17.2d
+; CHECK-NEXT: zip2 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmla v19.2d, v3.2d, v18.2d
+; CHECK-NEXT: fmla v4.2d, v0.2d, v16.2d
+; CHECK-NEXT: fmla v19.2d, v1.2d, v5.2d
+; CHECK-NEXT: fmla v4.2d, v1.2d, v18.2d
+; CHECK-NEXT: fneg v1.2d, v19.2d
+; CHECK-NEXT: fmls v4.2d, v3.2d, v5.2d
+; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec79 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec81 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec82 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec82, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec81, %strided.vec79
+ %2 = fmul fast <2 x double> %strided.vec81, %strided.vec
+ %3 = fmul fast <2 x double> %strided.vec82, %strided.vec79
+ %strided.vec84 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec85 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec87 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec88 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %4 = fmul fast <2 x double> %strided.vec87, %strided.vec84
+ %5 = fmul fast <2 x double> %strided.vec87, %strided.vec85
+ %6 = fmul fast <2 x double> %strided.vec88, %strided.vec84
+ %7 = fadd fast <2 x double> %5, %3
+ %8 = fadd fast <2 x double> %7, %6
+ %9 = fsub fast <2 x double> %2, %8
+ %10 = fadd fast <2 x double> %0, %1
+ %11 = fadd fast <2 x double> %10, %4
+ %12 = fmul fast <2 x double> %strided.vec88, %strided.vec85
+ %13 = fsub fast <2 x double> %11, %12
+ %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
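
Contrast with the contract file: here every fp op carries fast, and the IR arrives already reassociated (the fadd/fsub chains mix partial products from both multiplies), so the complex-deinterleaving pass matches none of these functions. The check lines pin down the fallback lowering, zip/fmul/fmla/fmls throughout, with not a single fcmla.
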
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
new file mode 100644
index 0000000..79ffe69
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; a * b + c
+define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 z6.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z7.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: fmul z2.d, z0.d, z6.d
+; CHECK-NEXT: fmla z2.d, p0/m, z7.d, z1.d
+; CHECK-NEXT: fmul z3.d, z7.d, z6.d
+; CHECK-NEXT: fnmsb z0.d, p0/m, z1.d, z3.d
+; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z3.d, z4.d, z5.d
+; CHECK-NEXT: fadd z3.d, z3.d, z0.d
+; CHECK-NEXT: fadd z1.d, z2.d, z1.d
+; CHECK-NEXT: zip1 z0.d, z3.d, z1.d
+; CHECK-NEXT: zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
+ %4 = fmul contract <vscale x 2 x double> %0, %3
+ %5 = fmul contract <vscale x 2 x double> %1, %2
+ %6 = fadd contract <vscale x 2 x double> %5, %4
+ %7 = fmul contract <vscale x 2 x double> %0, %2
+ %8 = fmul contract <vscale x 2 x double> %1, %3
+ %9 = fsub contract <vscale x 2 x double> %7, %8
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
+ %12 = fadd contract <vscale x 2 x double> %10, %9
+ %13 = fadd contract <vscale x 2 x double> %6, %11
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT: fadd z0.d, z25.d, z27.d
+; CHECK-NEXT: fadd z1.d, z26.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+ %4 = fmul contract <vscale x 2 x double> %0, %3
+ %5 = fmul contract <vscale x 2 x double> %1, %2
+ %6 = fadd contract <vscale x 2 x double> %5, %4
+ %7 = fmul contract <vscale x 2 x double> %0, %2
+ %8 = fmul contract <vscale x 2 x double> %1, %3
+ %9 = fsub contract <vscale x 2 x double> %7, %8
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+ %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+ %14 = fmul contract <vscale x 2 x double> %10, %13
+ %15 = fmul contract <vscale x 2 x double> %11, %12
+ %16 = fadd contract <vscale x 2 x double> %15, %14
+ %17 = fmul contract <vscale x 2 x double> %10, %12
+ %18 = fmul contract <vscale x 2 x double> %11, %13
+ %19 = fsub contract <vscale x 2 x double> %17, %18
+ %20 = fadd contract <vscale x 2 x double> %9, %19
+ %21 = fadd contract <vscale x 2 x double> %6, %16
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT: fsub z0.d, z25.d, z27.d
+; CHECK-NEXT: fsub z1.d, z26.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+ %4 = fmul contract <vscale x 2 x double> %0, %3
+ %5 = fmul contract <vscale x 2 x double> %1, %2
+ %6 = fadd contract <vscale x 2 x double> %5, %4
+ %7 = fmul contract <vscale x 2 x double> %0, %2
+ %8 = fmul contract <vscale x 2 x double> %1, %3
+ %9 = fsub contract <vscale x 2 x double> %7, %8
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+ %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+ %14 = fmul contract <vscale x 2 x double> %10, %13
+ %15 = fmul contract <vscale x 2 x double> %11, %12
+ %16 = fadd contract <vscale x 2 x double> %15, %14
+ %17 = fmul contract <vscale x 2 x double> %10, %12
+ %18 = fmul contract <vscale x 2 x double> %11, %13
+ %19 = fsub contract <vscale x 2 x double> %17, %18
+ %20 = fsub contract <vscale x 2 x double> %9, %19
+ %21 = fsub contract <vscale x 2 x double> %6, %16
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + conj(c) * d
+define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270
+; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270
+; CHECK-NEXT: fadd z0.d, z25.d, z27.d
+; CHECK-NEXT: fadd z1.d, z26.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
+ %4 = fmul contract <vscale x 2 x double> %0, %3
+ %5 = fmul contract <vscale x 2 x double> %1, %2
+ %6 = fadd contract <vscale x 2 x double> %5, %4
+ %7 = fmul contract <vscale x 2 x double> %0, %2
+ %8 = fmul contract <vscale x 2 x double> %1, %3
+ %9 = fsub contract <vscale x 2 x double> %7, %8
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
+ %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
+ %14 = fmul contract <vscale x 2 x double> %10, %13
+ %15 = fmul contract <vscale x 2 x double> %11, %12
+ %16 = fsub contract <vscale x 2 x double> %14, %15
+ %17 = fmul contract <vscale x 2 x double> %10, %12
+ %18 = fmul contract <vscale x 2 x double> %11, %13
+ %19 = fadd contract <vscale x 2 x double> %17, %18
+ %20 = fadd contract <vscale x 2 x double> %9, %19
+ %21 = fadd contract <vscale x 2 x double> %6, %16
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 z24.d, z4.d, z5.d
+; CHECK-NEXT: mov z26.d, #0 // =0x0
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: and z26.d, z26.d, #0x7fffffffffffffff
+; CHECK-NEXT: and z25.d, z25.d, #0x8000000000000000
+; CHECK-NEXT: uzp2 z27.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d
+; CHECK-NEXT: orr z5.d, z26.d, z25.d
+; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT: fadd z5.d, z1.d, z5.d
+; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000
+; CHECK-NEXT: orr z1.d, z26.d, z1.d
+; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT: fsub z1.d, z1.d, z24.d
+; CHECK-NEXT: uzp2 z24.d, z6.d, z7.d
+; CHECK-NEXT: fmul z3.d, z0.d, z2.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp1 z6.d, z6.d, z7.d
+; CHECK-NEXT: fmul z7.d, z1.d, z24.d
+; CHECK-NEXT: fmla z3.d, p0/m, z27.d, z4.d
+; CHECK-NEXT: fmla z7.d, p0/m, z6.d, z5.d
+; CHECK-NEXT: fmul z2.d, z27.d, z2.d
+; CHECK-NEXT: fmul z5.d, z5.d, z24.d
+; CHECK-NEXT: fnmsb z0.d, p0/m, z4.d, z2.d
+; CHECK-NEXT: fnmsb z1.d, p0/m, z6.d, z5.d
+; CHECK-NEXT: fadd z1.d, z0.d, z1.d
+; CHECK-NEXT: fadd z2.d, z3.d, z7.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 1
+ %4 = fmul contract <vscale x 2 x double> %0, %3
+ %5 = fmul contract <vscale x 2 x double> %1, %2
+ %6 = fadd contract <vscale x 2 x double> %5, %4
+ %7 = fmul contract <vscale x 2 x double> %0, %2
+ %8 = fmul contract <vscale x 2 x double> %1, %3
+ %9 = fsub contract <vscale x 2 x double> %7, %8
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
+ %12 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %11)
+ %13 = fadd contract <vscale x 2 x double> %10, %12
+ %14 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %10)
+ %15 = fsub contract <vscale x 2 x double> %14, %11
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %16 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
+ %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
+ %18 = fmul contract <vscale x 2 x double> %15, %17
+ %19 = fmul contract <vscale x 2 x double> %16, %13
+ %20 = fadd contract <vscale x 2 x double> %19, %18
+ %21 = fmul contract <vscale x 2 x double> %15, %16
+ %22 = fmul contract <vscale x 2 x double> %13, %17
+ %23 = fsub contract <vscale x 2 x double> %21, %22
+ %24 = fadd contract <vscale x 2 x double> %9, %23
+ %25 = fadd contract <vscale x 2 x double> %6, %20
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
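
The scalable variants follow the fixed-width contract file: the two-product and conjugate cases become predicated z-register fcmla pairs, while mull_add and mul_add_rot_mull stay scalarized (the copysign calls surface as sign-bit and/orr masking in the checks). One structural difference: shufflevector cannot express strided masks on scalable vectors, so deinterleaving goes through the intrinsics declared above. A minimal sketch of that idiom, using only the declarations from this file:

  %halves = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %v)
  %re = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %halves, 0   ; even lanes
  %im = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %halves, 1   ; odd lanes
  ; ... arithmetic on the separated halves ...
  %out = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %re, <vscale x 2 x double> %im)
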
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
new file mode 100644
index 0000000..f801a1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z7.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z1.d, p0/m, z4.d, z7.d
+; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT: movprfx z5, z6
+; CHECK-NEXT: fmla z5.d, p0/m, z4.d, z0.d
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: fmla z3.d, p0/m, z2.d, z7.d
+; CHECK-NEXT: fmls z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z3.d
+; CHECK-NEXT: zip2 z1.d, z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
+ %4 = fmul fast <vscale x 2 x double> %3, %0
+ %5 = fmul fast <vscale x 2 x double> %2, %1
+ %6 = fadd fast <vscale x 2 x double> %4, %5
+ %7 = fmul fast <vscale x 2 x double> %2, %0
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
+ %10 = fadd fast <vscale x 2 x double> %8, %7
+ %11 = fmul fast <vscale x 2 x double> %3, %1
+ %12 = fsub fast <vscale x 2 x double> %10, %11
+ %13 = fadd fast <vscale x 2 x double> %6, %9
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d
+; CHECK-NEXT: fmul z1.d, z1.d, z25.d
+; CHECK-NEXT: fmul z0.d, z24.d, z0.d
+; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d
+; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d
+; CHECK-NEXT: fmla z2.d, p0/m, z26.d, z3.d
+; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d
+; CHECK-NEXT: fsub z1.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+ %4 = fmul fast <vscale x 2 x double> %3, %0
+ %5 = fmul fast <vscale x 2 x double> %2, %1
+ %6 = fmul fast <vscale x 2 x double> %2, %0
+ %7 = fmul fast <vscale x 2 x double> %3, %1
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+ %12 = fmul fast <vscale x 2 x double> %11, %8
+ %13 = fmul fast <vscale x 2 x double> %10, %9
+ %14 = fmul fast <vscale x 2 x double> %10, %8
+ %15 = fmul fast <vscale x 2 x double> %11, %9
+ %16 = fadd fast <vscale x 2 x double> %15, %7
+ %17 = fadd fast <vscale x 2 x double> %14, %6
+ %18 = fsub fast <vscale x 2 x double> %17, %16
+ %19 = fadd fast <vscale x 2 x double> %4, %5
+ %20 = fadd fast <vscale x 2 x double> %19, %13
+ %21 = fadd fast <vscale x 2 x double> %20, %12
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: fmul z1.d, z1.d, z25.d
+; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d
+; CHECK-NEXT: uzp2 z6.d, z6.d, z7.d
+; CHECK-NEXT: fmul z0.d, z24.d, z0.d
+; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z3.d
+; CHECK-NEXT: fmul z3.d, z5.d, z3.d
+; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z4.d
+; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT: fsub z1.d, z1.d, z0.d
+; CHECK-NEXT: fsub z2.d, z2.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+ %4 = fmul fast <vscale x 2 x double> %3, %0
+ %5 = fmul fast <vscale x 2 x double> %2, %1
+ %6 = fmul fast <vscale x 2 x double> %2, %0
+ %7 = fmul fast <vscale x 2 x double> %3, %1
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+ %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
+ %12 = fmul fast <vscale x 2 x double> %11, %9
+ %13 = fmul fast <vscale x 2 x double> %10, %8
+ %14 = fadd fast <vscale x 2 x double> %13, %7
+ %15 = fadd fast <vscale x 2 x double> %12, %6
+ %16 = fsub fast <vscale x 2 x double> %15, %14
+ %17 = fmul fast <vscale x 2 x double> %10, %9
+ %18 = fmul fast <vscale x 2 x double> %11, %8
+ %19 = fadd fast <vscale x 2 x double> %18, %17
+ %20 = fadd fast <vscale x 2 x double> %4, %5
+ %21 = fsub fast <vscale x 2 x double> %20, %19
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + conj(c) * d
+define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmul z0.d, z24.d, z0.d
+; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d
+; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z25.d
+; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: uzp2 z2.d, z6.d, z7.d
+; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z4.d
+; CHECK-NEXT: fmad z3.d, p0/m, z2.d, z0.d
+; CHECK-NEXT: zip1 z0.d, z3.d, z1.d
+; CHECK-NEXT: zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
+ %4 = fmul fast <vscale x 2 x double> %3, %0
+ %5 = fmul fast <vscale x 2 x double> %2, %1
+ %6 = fmul fast <vscale x 2 x double> %2, %0
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
+ %11 = fmul fast <vscale x 2 x double> %10, %7
+ %12 = fmul fast <vscale x 2 x double> %9, %7
+ %13 = fmul fast <vscale x 2 x double> %10, %8
+ %14 = fmul fast <vscale x 2 x double> %3, %1
+ %15 = fsub fast <vscale x 2 x double> %6, %14
+ %16 = fadd fast <vscale x 2 x double> %15, %12
+ %17 = fadd fast <vscale x 2 x double> %16, %13
+ %18 = fadd fast <vscale x 2 x double> %4, %5
+ %19 = fmul fast <vscale x 2 x double> %9, %8
+ %20 = fsub fast <vscale x 2 x double> %18, %19
+ %21 = fadd fast <vscale x 2 x double> %20, %11
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT: fmul z0.d, z24.d, z0.d
+; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z24.d, z6.d, z7.d
+; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT: fmla z0.d, p0/m, z24.d, z3.d
+; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d
+; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z4.d
+; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT: fmls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: fnmsb z1.d, p0/m, z25.d, z0.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
+ %4 = fmul fast <vscale x 2 x double> %3, %0
+ %5 = fmul fast <vscale x 2 x double> %2, %1
+ %6 = fmul fast <vscale x 2 x double> %2, %0
+ %7 = fmul fast <vscale x 2 x double> %3, %1
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
+ %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
+ %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
+ %12 = fmul fast <vscale x 2 x double> %10, %8
+ %13 = fmul fast <vscale x 2 x double> %10, %9
+ %14 = fmul fast <vscale x 2 x double> %11, %8
+ %15 = fadd fast <vscale x 2 x double> %13, %7
+ %16 = fadd fast <vscale x 2 x double> %15, %14
+ %17 = fsub fast <vscale x 2 x double> %6, %16
+ %18 = fadd fast <vscale x 2 x double> %4, %5
+ %19 = fadd fast <vscale x 2 x double> %18, %12
+ %20 = fmul fast <vscale x 2 x double> %11, %9
+ %21 = fsub fast <vscale x 2 x double> %19, %20
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ ret <vscale x 4 x double> %interleaved.vec
+}
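+
+; Worked algebra for mul_add_rot_mull, same lane layout: i * (c * d)
+; contributes -im(c*d) to the real part and +re(c*d) to the imaginary part, so
+;   re = (a.re*b.re - a.im*b.im) - (c.re*d.im + c.im*d.re)   ; %17
+;   im = (a.re*b.im + a.im*b.re) + (c.re*d.re - c.im*d.im)   ; %21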
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
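+
+; Semantics assumed throughout (per the LangRef): deinterleave2 splits a
+; vector into its even- and odd-indexed lanes, e.g.
+;   deinterleave2(<a0, a1, a2, a3>) -> { <a0, a2>, <a1, a3> }
+; and interleave2 is its inverse, so the even lanes carry the real parts and
+; the odd lanes the imaginary parts of the complex values.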
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
index 4d84636..9409bb9 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
@@ -293,3 +293,107 @@ entry:
ret <4 x float> %interleaved.vec136
 }
 
+; Expected to transform. Shows that the composite common subexpression is not generated twice.
+; u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]);
+; v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]);
+define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
+; CHECK-LABEL: mul_add_common_mul_add_mul:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q17, q16, [sp, #96]
+; CHECK-NEXT: zip2 v20.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip2 v21.2d, v6.2d, v7.2d
+; CHECK-NEXT: zip1 v4.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT: ldp q19, q18, [sp, #64]
+; CHECK-NEXT: zip2 v23.2d, v17.2d, v16.2d
+; CHECK-NEXT: fmul v6.2d, v21.2d, v20.2d
+; CHECK-NEXT: zip1 v16.2d, v17.2d, v16.2d
+; CHECK-NEXT: zip2 v22.2d, v19.2d, v18.2d
+; CHECK-NEXT: zip1 v18.2d, v19.2d, v18.2d
+; CHECK-NEXT: fneg v6.2d, v6.2d
+; CHECK-NEXT: fmul v20.2d, v5.2d, v20.2d
+; CHECK-NEXT: fmul v7.2d, v22.2d, v23.2d
+; CHECK-NEXT: fmla v6.2d, v4.2d, v5.2d
+; CHECK-NEXT: zip2 v5.2d, v2.2d, v3.2d
+; CHECK-NEXT: fneg v7.2d, v7.2d
+; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: fmla v7.2d, v18.2d, v16.2d
+; CHECK-NEXT: fadd v19.2d, v7.2d, v6.2d
+; CHECK-NEXT: fmla v20.2d, v4.2d, v21.2d
+; CHECK-NEXT: zip2 v4.2d, v0.2d, v1.2d
+; CHECK-NEXT: ldp q7, q6, [sp]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmla v20.2d, v18.2d, v23.2d
+; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d
+; CHECK-NEXT: fmla v20.2d, v22.2d, v16.2d
+; CHECK-NEXT: mov v3.16b, v19.16b
+; CHECK-NEXT: fmla v1.2d, v0.2d, v5.2d
+; CHECK-NEXT: fmla v3.2d, v4.2d, v5.2d
+; CHECK-NEXT: ldp q16, q4, [sp, #32]
+; CHECK-NEXT: fneg v17.2d, v3.2d
+; CHECK-NEXT: zip1 v3.2d, v7.2d, v6.2d
+; CHECK-NEXT: zip2 v6.2d, v7.2d, v6.2d
+; CHECK-NEXT: zip1 v5.2d, v16.2d, v4.2d
+; CHECK-NEXT: fmla v17.2d, v0.2d, v2.2d
+; CHECK-NEXT: fsub v18.2d, v1.2d, v20.2d
+; CHECK-NEXT: zip2 v0.2d, v16.2d, v4.2d
+; CHECK-NEXT: fmla v19.2d, v3.2d, v5.2d
+; CHECK-NEXT: st2 { v17.2d, v18.2d }, [x0]
+; CHECK-NEXT: fmls v19.2d, v6.2d, v0.2d
+; CHECK-NEXT: fmla v20.2d, v6.2d, v5.2d
+; CHECK-NEXT: fmla v20.2d, v3.2d, v0.2d
+; CHECK-NEXT: st2 { v19.2d, v20.2d }, [x1]
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x double> %strided.vec125, %strided.vec
+ %1 = fmul fast <2 x double> %strided.vec126, %strided.vec
+ %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123
+ %3 = fadd fast <2 x double> %1, %2
+ %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128
+ %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129
+ %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128
+ %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129
+ %8 = fsub fast <2 x double> %4, %5
+ %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134
+ %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135
+ %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134
+ %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138
+ %13 = fsub fast <2 x double> %11, %12
+ %14 = fadd fast <2 x double> %13, %8
+ %15 = fadd fast <2 x double> %6, %7
+ %16 = fadd fast <2 x double> %15, %9
+ %17 = fadd fast <2 x double> %16, %10
+ %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123
+ %19 = fadd fast <2 x double> %18, %14
+ %20 = fsub fast <2 x double> %0, %19
+ %21 = fsub fast <2 x double> %3, %17
+ %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ store <4 x double> %interleaved.vec, ptr %p1, align 8
+ %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140
+ %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140
+ %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141
+ %25 = fadd fast <2 x double> %22, %14
+ %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141
+ %27 = fsub fast <2 x double> %25, %26
+ %28 = fadd fast <2 x double> %24, %17
+ %29 = fadd fast <2 x double> %28, %23
+ %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ store <4 x double> %interleaved.vec145, ptr %p2, align 8
+ ret void
+}
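+
+; A C-level sketch of the source this corresponds to (hypothetical; only the
+; IR above is authoritative):
+;   for (int i = 0; i < N; i++) {
+;     _Complex double t = c[i] * d[i] + g[i] * h[i];
+;     u[i] = a[i] * b[i] - t;   // stored to %p1
+;     v[i] = e[i] * f[i] + t;   // stored to %p2
+;   }
+; %14 and %17 (the real and imaginary halves of t) are computed once and
+; reused by both interleaved stores.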