aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Green <david.green@arm.com>2021-01-06 10:08:43 +0000
committerDavid Green <david.green@arm.com>2021-01-06 10:08:43 +0000
commit78d8a821e23e42d13dcbb3467747e480fb889b8a (patch)
tree5ca0d02d2d1802141cf9e7dab686f1458aeab10f
parenta7e3339f3b0eb71e43d44e6f59cc8db6a7b110bf (diff)
downloadllvm-78d8a821e23e42d13dcbb3467747e480fb889b8a.zip
llvm-78d8a821e23e42d13dcbb3467747e480fb889b8a.tar.gz
llvm-78d8a821e23e42d13dcbb3467747e480fb889b8a.tar.bz2
[AArch64] Handle any extend whilst lowering mull
Demanded bits may turn a sext or zext into an anyext if the top bits are not needed. This currently prevents the lowering to instructions like mull, addl and addw. This patch fixes the mull generation by keeping it simple and treating them like zextends. Differential Revision: https://reviews.llvm.org/D93832
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-smull.ll167
-rw-r--r--llvm/test/CodeGen/AArch64/lowerMUL-newload.ll51
3 files changed, 63 insertions, 159 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b9dc84..41dc285 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3347,7 +3347,8 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ if (N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
@@ -3377,6 +3378,7 @@ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::ANY_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, false);
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 17a21e5..0a69219 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -96,9 +96,7 @@ define <8 x i16> @amull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -115,9 +113,7 @@ define <4 x i32> @amull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -135,16 +131,7 @@ define <2 x i64> @amull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -268,12 +255,10 @@ define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlal_v8i8_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: mla v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -290,14 +275,12 @@ define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlal_v4i16_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -313,20 +296,10 @@ define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlal_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -452,12 +425,10 @@ define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlsl_v8i8_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -474,14 +445,12 @@ define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlsl_v4i16_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mls v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -497,20 +466,10 @@ define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlsl_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -626,9 +585,8 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.8h, #12
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v1.8b, #12
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
@@ -641,9 +599,8 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v4i16_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v1.4h, w8
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -656,14 +613,9 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x10, x8
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -800,14 +752,11 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: amull2_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: mul v1.8h, v2.8h, v3.8h
+; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEXT: bic v2.8h, #255, lsl #8
; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
@@ -819,15 +768,11 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: amull2_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mul v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
@@ -839,29 +784,11 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.2d, v0.4s, #0
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x13, d3
-; CHECK-NEXT: fmov x14, d2
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mov x11, v3.d[1]
-; CHECK-NEXT: mov x12, v2.d[1]
-; CHECK-NEXT: mul x13, x14, x13
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mul x9, x12, x11
-; CHECK-NEXT: fmov d1, x13
-; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: mov v1.d[1], x9
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
index 530aeed..b8d0afc 100644
--- a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
+++ b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
@@ -21,10 +21,8 @@ entry:
define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: mla v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: umlal v2.4s, v1.4h, v0.4h
; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
@@ -91,13 +89,10 @@ entry:
define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
+; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <4 x i16> %vec0 to <4 x i32>
@@ -162,20 +157,10 @@ entry:
define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d1, x10
-; CHECK-NEXT: ushll v0.2d, v2.2s, #0
-; CHECK-NEXT: mov v1.d[1], x8
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: umlal v2.2d, v1.2s, v0.2s
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <2 x i32> %vec0 to <2 x i64>
@@ -240,20 +225,10 @@ entry:
define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: fmov x9, d2
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v2.d[1]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: mul x9, x11, x9
-; CHECK-NEXT: mul x8, x10, x8
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <2 x i32> %vec0 to <2 x i64>