aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h3
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp101
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll152
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll191
4 files changed, 440 insertions, 7 deletions
diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 99df6e5..3d11bf3 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -38,7 +38,8 @@ enum class ComplexDeinterleavingOperation {
CMulPartial,
// The following 'operations' are used to represent internal states. Backends
// are not expected to try and support these in any capacity.
- Shuffle
+ Shuffle,
+ Symmetric
};
enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 3945913..fcc25d9 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -254,6 +254,7 @@ private:
/// 270: r: ar + bi
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
+ NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
NodePtr identifyNode(Instruction *I, Instruction *J);
@@ -702,6 +703,59 @@ static bool isInstructionPairMul(Instruction *A, Instruction *B) {
return match(A, Pattern) && match(B, Pattern);
}
+/// Returns true when the opcode of \p I is one of FAdd, FSub, FMul or FNeg,
+/// i.e. one of the opcodes that may be treated as a symmetric operation.
+static bool isInstructionPotentiallySymmetric(Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+  return Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+         Opcode == Instruction::FMul || Opcode == Instruction::FNeg;
+}
+
+/// Attempts to match \p Real and \p Imag as the two halves of a symmetric
+/// operation: both must share one opcode accepted by
+/// isInstructionPotentiallySymmetric, and every corresponding operand pair
+/// must consist of instructions that identifyNode recognises.
+///
+/// Returns the submitted Symmetric composite node on success and nullptr
+/// otherwise.
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
+                                                       Instruction *Imag) {
+  if (Real->getOpcode() != Imag->getOpcode() ||
+      !isInstructionPotentiallySymmetric(Real) ||
+      !isInstructionPotentiallySymmetric(Imag))
+    return nullptr;
+
+  // Matches operand number Idx of Real against operand number Idx of Imag.
+  // Both operands have to be instructions before identifyNode is consulted.
+  auto IdentifyOperandPair = [&](unsigned Idx) -> NodePtr {
+    auto *RealOperand = dyn_cast<Instruction>(Real->getOperand(Idx));
+    auto *ImagOperand = dyn_cast<Instruction>(Imag->getOperand(Idx));
+    if (!RealOperand || !ImagOperand)
+      return nullptr;
+    return identifyNode(RealOperand, ImagOperand);
+  };
+
+  const bool HasSecondOperand = Real->isBinaryOp();
+
+  NodePtr FirstOperand = IdentifyOperandPair(0);
+  if (FirstOperand == nullptr)
+    return nullptr;
+
+  NodePtr SecondOperand = nullptr;
+  if (HasSecondOperand) {
+    SecondOperand = IdentifyOperandPair(1);
+    if (SecondOperand == nullptr)
+      return nullptr;
+  }
+
+  auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
+                                   Real, Imag);
+  Node->addOperand(FirstOperand);
+  if (HasSecondOperand)
+    Node->addOperand(SecondOperand);
+
+  return submitCompositeNode(Node);
+}
+
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n");
@@ -815,7 +869,10 @@ ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
return identifyAdd(Real, Imag);
}
- return nullptr;
+ auto Symmetric = identifySymmetricOperation(Real, Imag);
+ LLVM_DEBUG(if (Symmetric == nullptr) dbgs()
+ << " - Not recognised as a valid pattern.\n");
+ return Symmetric;
}
bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
@@ -847,21 +904,53 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
return RootNode != nullptr;
}
+/// Emits the single instruction that replaces a Symmetric node.
+///
+/// The opcode is taken from the node's Real instruction, and the new
+/// instruction is built with an IRBuilder positioned at that instruction.
+/// \p InputA and \p InputB are the already replaced operands; \p InputB must
+/// be null for unary operations and non-null for binary ones.
+///
+/// Returns the created value, or nullptr when the opcode is not one of FNeg,
+/// FAdd, FSub or FMul.
+static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node,
+                                   Value *InputA, Value *InputB) {
+  Instruction *I = Node->Real;
+  // Each check is one self-contained assert so that no empty if/else bodies
+  // remain when asserts are compiled out.
+  assert((!I->isUnaryOp() || !InputB) &&
+         "Unary symmetric operations need one input, but two were provided.");
+  assert((!I->isBinaryOp() || InputB) &&
+         "Binary symmetric operations need two inputs, only one "
+         "was provided.");
+
+  IRBuilder<> B(I);
+
+  // The replacement stands in for both the real and the imaginary
+  // instruction, so it may only carry the fast-math flags they have in
+  // common. Without this the new instruction would lose every flag.
+  if (isa<FPMathOperator>(Node->Real) && isa<FPMathOperator>(Node->Imag)) {
+    FastMathFlags CommonFlags = Node->Real->getFastMathFlags();
+    CommonFlags &= Node->Imag->getFastMathFlags();
+    B.setFastMathFlags(CommonFlags);
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::FNeg:
+    return B.CreateFNeg(InputA);
+  case Instruction::FAdd:
+    return B.CreateFAdd(InputA, InputB);
+  case Instruction::FSub:
+    return B.CreateFSub(InputA, InputB);
+  case Instruction::FMul:
+    return B.CreateFMul(InputA, InputB);
+  }
+
+  // Unsupported opcode: the caller asserts on a null replacement.
+  return nullptr;
+}
+
Value *ComplexDeinterleavingGraph::replaceNode(
ComplexDeinterleavingGraph::RawNodePtr Node) {
if (Node->ReplacementNode)
return Node->ReplacementNode;
Value *Input0 = replaceNode(Node->Operands[0]);
- Value *Input1 = replaceNode(Node->Operands[1]);
+ Value *Input1 =
+ Node->Operands.size() > 1 ? replaceNode(Node->Operands[1]) : nullptr;
Value *Accumulator =
Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr;
- assert(Input0->getType() == Input1->getType() &&
- "Node inputs need to be of the same type");
+ if (Input1)
+ assert(Input0->getType() == Input1->getType() &&
+ "Node inputs need to be of the same type");
- Node->ReplacementNode = TL->createComplexDeinterleavingIR(
- Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+ if (Node->Operation == ComplexDeinterleavingOperation::Symmetric)
+ Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1);
+ else
+ Node->ReplacementNode = TL->createComplexDeinterleavingIR(
+ Node->Real, Node->Operation, Node->Rotation, Input0, Input1,
+ Accumulator);
assert(Node->ReplacementNode && "Target failed to create Intrinsic call.");
NumComplexTransformations += 1;
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
index d8b30fd..fd98918 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -353,3 +353,155 @@ entry:
%interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x float> %interleaved.vec
}
+
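+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed an fadd, which is matched as a symmetric operation.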
+define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_addequal:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT: fadd v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fadd fast <2 x float> %5, %c.real
+ %7 = fadd fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
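+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed an fsub, which is matched as a symmetric operation.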
+define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_subequal:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT: fsub v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fsub fast <2 x float> %5, %c.real
+ %7 = fsub fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
+
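+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed a lane-wise fmul, which is matched as a symmetric operation.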
+define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mulequal:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT: fcmla v3.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT: fmul v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fmul fast <2 x float> %5, %c.real
+ %7 = fmul fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
+; Expected not to transform: fdiv is not one of the supported symmetric
+; operations (fadd, fsub, fmul, fneg).
+define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_divequal:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v16.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip2 v6.2s, v0.2s, v4.2s
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT: zip1 v4.2s, v2.2s, v16.2s
+; CHECK-NEXT: zip2 v2.2s, v2.2s, v16.2s
+; CHECK-NEXT: fmul v7.2s, v6.2s, v5.2s
+; CHECK-NEXT: fneg v3.2s, v7.2s
+; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s
+; CHECK-NEXT: fmul v0.2s, v5.2s, v0.2s
+; CHECK-NEXT: fmla v0.2s, v6.2s, v1.2s
+; CHECK-NEXT: fdiv v3.2s, v3.2s, v4.2s
+; CHECK-NEXT: fdiv v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fdiv fast <2 x float> %5, %c.real
+ %7 = fdiv fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
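+; Expected to transform: the real and imaginary halves of a complex multiply
+; are each negated with fneg, which is matched as a unary symmetric operation.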
+define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_negequal:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.2d, #0000000000000000
+; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT: fneg v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %6 = fneg fast <2 x float> %5
+ %7 = fneg fast <2 x float> %2
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
index b16b06b..a529aa8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -385,3 +385,194 @@ entry:
%interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x float> %interleaved.vec
}
+
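+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed an fadd, which is matched as a symmetric operation.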
+define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_addequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT: vadd.f32 q0, q3, q1
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fadd fast <2 x float> %5, %c.real
+ %7 = fadd fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
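+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed an fsub, which is matched as a symmetric operation.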
+define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_subequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT: vsub.f32 q0, q3, q1
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fsub fast <2 x float> %5, %c.real
+ %7 = fsub fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
+
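+; Expected to transform: the real and imaginary halves of a complex multiply
+; each feed a lane-wise fmul, which is matched as a symmetric operation.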
+define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mulequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
+; CHECK-NEXT: vmul.f32 q0, q3, q1
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fmul fast <2 x float> %5, %c.real
+ %7 = fmul fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
+; Expected not to transform: fdiv is not one of the supported symmetric
+; operations (fadd, fsub, fmul, fneg).
+define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_divequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d10, d11}
+; CHECK-NEXT: vpush {d10, d11}
+; CHECK-NEXT: .vsave {d8}
+; CHECK-NEXT: vpush {d8}
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: add r0, sp, #24
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov.f32 s16, s1
+; CHECK-NEXT: add.w r12, sp, #40
+; CHECK-NEXT: vmov.f32 s12, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmul.f32 q5, q3, q0
+; CHECK-NEXT: vmov.f32 s9, s6
+; CHECK-NEXT: vldrw.u32 q1, [r12]
+; CHECK-NEXT: vmov.f32 s17, s3
+; CHECK-NEXT: vfma.f32 q5, q2, q4
+; CHECK-NEXT: vmul.f32 q3, q4, q3
+; CHECK-NEXT: vdiv.f32 s3, s21, s7
+; CHECK-NEXT: vneg.f32 q3, q3
+; CHECK-NEXT: vfma.f32 q3, q2, q0
+; CHECK-NEXT: vdiv.f32 s1, s20, s5
+; CHECK-NEXT: vdiv.f32 s2, s13, s6
+; CHECK-NEXT: vdiv.f32 s0, s12, s4
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vpop {d8}
+; CHECK-NEXT: vpop {d10, d11}
+; CHECK-NEXT: bx lr
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %6 = fdiv fast <2 x float> %5, %c.real
+ %7 = fdiv fast <2 x float> %2, %c.imag
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}
+
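+; Expected to transform: the real and imaginary halves of a complex multiply
+; are each negated with fneg, which is matched as a unary symmetric operation.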
+define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_negequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT: vneg.f32 q0, q2
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+ %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+ %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+ %0 = fmul fast <2 x float> %b.imag, %strided.vec
+ %1 = fmul fast <2 x float> %b.real, %a.imag
+ %2 = fadd fast <2 x float> %1, %0
+ %3 = fmul fast <2 x float> %b.real, %strided.vec
+ %4 = fmul fast <2 x float> %a.imag, %b.imag
+ %5 = fsub fast <2 x float> %3, %4
+ %6 = fneg fast <2 x float> %5
+ %7 = fneg fast <2 x float> %2
+ %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x float> %interleaved.vec
+}