about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Rin Dobrescu <irina.dobrescu@arm.com> 2024-01-22 11:59:40 +0000
committer GitHub <noreply@github.com> 2024-01-22 11:59:40 +0000
commit 365aa1574a1b4a3cdee6648227d095d00536ffde (patch)
tree f1a28bfd588d21e9631e6f188e00093af3d5352c
parent 865e4a1f33bd3be42ff256c6839aff0860610a5a (diff)
download llvm-365aa1574a1b4a3cdee6648227d095d00536ffde.zip
llvm-365aa1574a1b4a3cdee6648227d095d00536ffde.tar.gz
llvm-365aa1574a1b4a3cdee6648227d095d00536ffde.tar.bz2
[AArch64] Convert UADDV(add(zext, zext)) into UADDLV(concat). (#78301)
We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into UADDLV(concat), where the concat represents the 64-bit zext sources.
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 48
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll 54
-rw-r--r-- llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll 12
-rw-r--r-- llvm/test/CodeGen/AArch64/vecreduce-add.ll 155
4 files changed, 179 insertions, 90 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 96ea692..23d37d6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16603,8 +16603,7 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
auto DetectAddExtract = [&](SDValue A) {
// Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
// UADDLP(x) if found.
- if (A.getOpcode() != ISD::ADD)
- return SDValue();
+ assert(A.getOpcode() == ISD::ADD);
EVT VT = A.getValueType();
SDValue Op0 = A.getOperand(0);
SDValue Op1 = A.getOperand(1);
@@ -16647,11 +16646,54 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
return SDValue();
}
+// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
+// UADDLV(concat), where the concat represents the 64-bit zext sources.
+static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
+ // Look for add(zext(64-bit source), zext(64-bit source)), returning
+ // UADDLV(concat(source, source)) if found.
+ assert(A.getOpcode() == ISD::ADD);
+ EVT VT = A.getValueType();
+ if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
+ return SDValue();
+ SDValue Op0 = A.getOperand(0);
+ SDValue Op1 = A.getOperand(1);
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
+ return SDValue();
+ SDValue Ext0 = Op0.getOperand(0);
+ SDValue Ext1 = Op1.getOperand(0);
+ EVT ExtVT0 = Ext0.getValueType();
+ EVT ExtVT1 = Ext1.getValueType();
+ // Check zext VTs are the same and 64-bit length.
+ if (ExtVT0 != ExtVT1 ||
+ VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
+ return SDValue();
+ // Get VT for concat of zext sources.
+ EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v2i64:
+ case MVT::v4i32:
+ return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
+ case MVT::v8i16: {
+ SDValue Uaddlv =
+ DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
+ return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
+ }
+ default:
+ llvm_unreachable("Unhandled vector type");
+ }
+}
+
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
SDValue A = N->getOperand(0);
- if (A.getOpcode() == ISD::ADD)
+ if (A.getOpcode() == ISD::ADD) {
if (SDValue R = performUADDVAddCombine(A, DAG))
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
+ else if (SDValue R = performUADDVZextCombine(A, DAG))
+ return R;
+ }
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll
new file mode 100644
index 0000000..b1b9959
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i16 @test_add_zext_v8i16(<8 x i8> %a, <8 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_add_zext_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+ %z1 = zext <8 x i8> %a to <8 x i16>
+ %z2 = zext <8 x i8> %b to <8 x i16>
+ %z = add <8 x i16> %z1, %z2
+ %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %z)
+ ret i16 %r
+}
+
+define i32 @test_add_zext_v4i32(<4 x i16> %a, <4 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_add_zext_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uaddlv s0, v0.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i16> %b to <4 x i32>
+ %z = add <4 x i32> %z1, %z2
+ %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
+ ret i32 %r
+}
+
+define i64 @test_add_zext_v2i64(<2 x i32> %a, <2 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_add_zext_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uaddlv d0, v0.4s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %z1 = zext <2 x i32> %a to <2 x i64>
+ %z2 = zext <2 x i32> %b to <2 x i64>
+ %z = add <2 x i64> %z1, %z2
+ %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %z)
+ ret i64 %r
+}
+
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
index 1fc177f..24cce9a 100644
--- a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
@@ -18,14 +18,14 @@ define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <
; CHECK-NEXT: mov v4.s[2], v6.s[0]
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: mov v4.s[3], v3.s[0]
-; CHECK-NEXT: xtn v2.4h, v0.4s
+; CHECK-NEXT: xtn v1.4h, v0.4s
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
-; CHECK-NEXT: xtn v1.4h, v4.4s
+; CHECK-NEXT: xtn v2.4h, v4.4s
; CHECK-NEXT: shrn v3.4h, v4.4s, #16
-; CHECK-NEXT: uhadd v0.4h, v2.4h, v0.4h
-; CHECK-NEXT: uhadd v1.4h, v1.4h, v3.4h
-; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: uhadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: uhadd v1.4h, v2.4h, v3.4h
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%l87 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 0b43e3b..ad82d2e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1434,37 +1434,21 @@ entry:
}
define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
-; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: addv b0, v0.16b
-; CHECK-SD-BASE-NEXT: fmov w8, s0
-; CHECK-SD-BASE-NEXT: add w8, w8, w0
-; CHECK-SD-BASE-NEXT: and w0, w8, #0xff
-; CHECK-SD-BASE-NEXT: ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
-; CHECK-SD-DOT: // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT: addv b0, v0.16b
-; CHECK-SD-DOT-NEXT: fmov w8, s0
-; CHECK-SD-DOT-NEXT: add w8, w8, w0
-; CHECK-SD-DOT-NEXT: and w0, w8, #0xff
-; CHECK-SD-DOT-NEXT: ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: addv b0, v0.16b
-; CHECK-GI-BASE-NEXT: fmov w8, s0
-; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxtb
-; CHECK-GI-BASE-NEXT: and w0, w8, #0xff
-; CHECK-GI-BASE-NEXT: ret
+; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w8, w8, w0
+; CHECK-SD-NEXT: and w0, w8, #0xff
+; CHECK-SD-NEXT: ret
;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: addv b0, v0.16b
-; CHECK-GI-DOT-NEXT: fmov w8, s0
-; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxtb
-; CHECK-GI-DOT-NEXT: and w0, w8, #0xff
-; CHECK-GI-DOT-NEXT: ret
+; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: addv b0, v0.16b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w8, w0, w8, uxtb
+; CHECK-GI-NEXT: and w0, w8, #0xff
+; CHECK-GI-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
@@ -1783,8 +1767,10 @@ entry:
define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
@@ -1889,8 +1875,10 @@ entry:
define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -3298,8 +3286,8 @@ define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
@@ -3590,10 +3578,12 @@ entry:
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -3710,9 +3700,11 @@ entry:
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT: addv h0, v0.8h
-; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv h0, v0.16b
+; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
@@ -4047,8 +4039,8 @@ define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
@@ -5378,33 +5370,19 @@ entry:
}
define i32 @extract_hi_lo(<8 x i16> %a) {
-; CHECK-SD-BASE-LABEL: extract_hi_lo:
-; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
-; CHECK-SD-BASE-NEXT: fmov w0, s0
-; CHECK-SD-BASE-NEXT: ret
-;
-; CHECK-SD-DOT-LABEL: extract_hi_lo:
-; CHECK-SD-DOT: // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h
-; CHECK-SD-DOT-NEXT: fmov w0, s0
-; CHECK-SD-DOT-NEXT: ret
-;
-; CHECK-GI-BASE-LABEL: extract_hi_lo:
-; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: fmov w0, s0
-; CHECK-GI-BASE-NEXT: ret
+; CHECK-SD-LABEL: extract_hi_lo:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
;
-; CHECK-GI-DOT-LABEL: extract_hi_lo:
-; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-DOT-NEXT: addv s0, v0.4s
-; CHECK-GI-DOT-NEXT: fmov w0, s0
-; CHECK-GI-DOT-NEXT: ret
+; CHECK-GI-LABEL: extract_hi_lo:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -5416,12 +5394,20 @@ entry:
}
define i32 @extract_hi_hi(<8 x i16> %a) {
-; CHECK-LABEL: extract_hi_hi:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: extract_hi_hi:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_hi_hi:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: uaddl2 v0.4s, v0.8h, v0.8h
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%z2 = zext <4 x i16> %e2 to <4 x i32>
@@ -5431,12 +5417,19 @@ entry:
}
define i32 @extract_lo_lo(<8 x i16> %a) {
-; CHECK-LABEL: extract_lo_lo:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddl v0.4s, v0.4h, v0.4h
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: extract_lo_lo:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-SD-NEXT: uaddlv s0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_lo_lo:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: uaddl v0.4s, v0.4h, v0.4h
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%z1 = zext <4 x i16> %e1 to <4 x i32>